Build quantize binary in setup.py

tiran · tiran · commit fa9f88c66df1 · 2025-02-27T13:07:59.000+01:00
- remove pre-built binaries from git
- create a proper sdist with sources
- add `setup.py` to compile `quantize` binary for current platform
- update to latest release of `llama.cpp`

Signed-off-by: Christian Heimes <cheimes@redhat.com>
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
@@ -14,26 +14,65 @@ on:
         types:
             - published
 
-permissions:
-    # allow gh release upload
-    contents: write
-    # see https://docs.pypi.org/trusted-publishers/
-    id-token: write
-
 jobs:
     build-package:
         name: Build and check packages
         runs-on: ubuntu-latest
+        if: 0
         steps:
             - uses: actions/checkout@v4
               with:
                   # for setuptools-scm
                   fetch-depth: 0
+                  submodules: true
 
             - uses: hynek/build-and-inspect-python-package@v2
 
+    build_wheels:
+        name: Build wheels on ${{ matrix.os }}
+        runs-on: ${{ matrix.os }}
+        strategy:
+            matrix:
+                # macos-13 is an intel runner, macos-14 is apple silicon
+                os: [ubuntu-latest, ubuntu-24.04-arm, macos-14]
+
+        steps:
+            - uses: actions/checkout@v4
+              with:
+                  # for setuptools-scm
+                  fetch-depth: 0
+                  submodules: true
+
+            - name: Build wheels
+              uses: pypa/cibuildwheel@v2.22.0
+
+            - uses: actions/upload-artifact@v4
+              with:
+                  name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
+                  path: ./wheelhouse/*.whl
+
+    build_sdist:
+        name: Build source distribution
+        runs-on: ubuntu-latest
+        steps:
+            - uses: actions/checkout@v4
+              with:
+                  # for setuptools-scm
+                  fetch-depth: 0
+                  submodules: true
+
+            - name: Build sdist
+              run: pipx run build --sdist
+
+            - uses: actions/upload-artifact@v4
+              with:
+                  name: cibw-sdist
+                  path: dist/*.tar.gz
+
     publish-test-pypi:
         name: Publish packages to test.pypi.org
+        permissions:
+            id-token: write
         # environment: publish-test-pypi
         # TODO: move to instructlab
         if: |
@@ -48,8 +87,9 @@ jobs:
             - name: Fetch build artifacts
               uses: actions/download-artifact@v4
               with:
-                  name: Packages
+                  pattern: cibw-*
                   path: dist
+                  merge-multiple: true
 
             - name: Upload to Test PyPI
               uses: pypa/gh-action-pypi-publish@release/v1
@@ -60,17 +100,19 @@ jobs:
         name: Publish release to pypi.org
         # environment: publish-pypi
         # TODO: move to instructlab
-        if: |
-            github.repository_owner == 'tiran' && github.event.action == 'published'
+        #if: |
+        #    github.repository_owner == 'tiran' && github.event.action == 'published'
+        if: 0
         runs-on: ubuntu-latest
         needs: build-package
 
         steps:
             - name: Fetch build artifacts
               uses: actions/download-artifact@v4
               with:
-                  name: Packages
+                  pattern: cibw-*
                   path: dist
+                  merge-multiple: true
 
             - uses: sigstore/gh-action-sigstore-python@v2.1.1
               with:
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -26,19 +26,21 @@ jobs:
                     - "3.10"
                     - "3.11"
                     - "3.12"
-                    - "3.13-dev"
+                    - "3.13"
         steps:
             - uses: "actions/checkout@v4"
               with:
-                submodules: true
+                  submodules: true
+                  # for setuptools-scm
+                  fetch-depth: 0
 
             - uses: "actions/setup-python@v5"
               with:
                   python-version: "${{ matrix.python-version }}"
                   allow-prereleases: true
 
             - name: "Update pip"
-              run: python -m pip install --upgrade pip setuptools wheel
+              run: python -m pip install --upgrade pip
 
             - name: "Install tox dependencies"
               run: python -m pip install --upgrade tox tox-gh-actions
@@ -55,7 +57,7 @@ jobs:
                 submodules: true
 
             - name: "Update pip"
-              run: python -m pip install --upgrade pip setuptools wheel
+              run: python -m pip install --upgrade pip
 
             - name: "Install tox dependencies"
               run: python -m pip install --upgrade tox
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,24 @@
+include tox.ini tests.py .pylintrc
+recursive-include llama.cpp *
+exclude llama.cpp/.git
+
+global-exclude gguf.inp gguf.out
+exclude llama.cpp/models/ggml-vocab-aquila.gguf
+exclude llama.cpp/models/ggml-vocab-baichuan.gguf
+exclude llama.cpp/models/ggml-vocab-bert-bge.gguf
+exclude llama.cpp/models/ggml-vocab-command-r.gguf
+exclude llama.cpp/models/ggml-vocab-deepseek-coder.gguf
+exclude llama.cpp/models/ggml-vocab-deepseek-llm.gguf
+exclude llama.cpp/models/ggml-vocab-falcon.gguf
+exclude llama.cpp/models/ggml-vocab-gpt2.gguf
+exclude llama.cpp/models/ggml-vocab-gpt-neox.gguf
+# used in tests.py
+# exclude llama.cpp/models/ggml-vocab-llama.gguf
+exclude llama.cpp/models/ggml-vocab-mpt.gguf
+exclude llama.cpp/models/ggml-vocab-llama-bpe.gguf
+exclude llama.cpp/models/ggml-vocab-llama-spm.gguf
+exclude llama.cpp/models/ggml-vocab-phi-3.gguf
+exclude llama.cpp/models/ggml-vocab-qwen2.gguf
+exclude llama.cpp/models/ggml-vocab-refact.gguf
+exclude llama.cpp/models/ggml-vocab-stablelm-3b-4e1t.gguf
+exclude llama.cpp/models/ggml-vocab-starcoder.gguf
diff --git a/Makefile b/Makefile
@@ -1,16 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 
-CMAKE_ARGS ?=
+CMAKE_ARGS ?= -GNinja \
+	-DCMAKE_BUILD_TYPE=Release \
+	-DBUILD_SHARED_LIBS=OFF \
+	-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF \
+	-DLLAMA_BUILD_TESTS=OFF \
+	-DLLAMA_BUILD_SERVER=OFF
 
 UNAME_MACHINE = $(shell uname -m | tr A-Z a-z)
 UNAME_OS = $(shell uname -s | tr A-Z a-z)
 QUANTIZE = build/quantize-$(UNAME_MACHINE)-$(UNAME_OS)
 LLAMA_BUILDDIR = build/llama.cpp-$(UNAME_MACHINE)-$(UNAME_OS)
 LLAMA_DIR = llama.cpp
 
-
 .PHONY: all
-all: test $(QUANTIZE)
+all:
 
 .PHONY: test
 test:
@@ -23,17 +27,21 @@ fix:
 
 .PHONY: clean
 clean:
-	rm -rf .tox .ruff_cache dist build
+	rm -rf dist build
+
+.PHONY: realclean
+realclean: clean
+	rm -rf .tox .ruff_cache .mypy_cache
 
-$(LLAMA_BUILDDIR)/Makefile: $(LLAMA_DIR)/CMakeLists.txt
+$(LLAMA_BUILDDIR)/Makefile: $(LLAMA_DIR)/CMakeLists.txt $(MAKEFILE_LIST)
 	@mkdir -p $(dir $@)
-	CMAKE_ARGS="$(CMAKE_ARGS)" cmake -S $(dir $<) -B $(dir $@)
+	cmake -S $(dir $<) -B $(dir $@) $(CMAKE_ARGS)
 
-$(LLAMA_BUILDDIR)/bin/quantize: $(LLAMA_BUILDDIR)/Makefile
-	cmake --build $(dir $<) --parallel 2 --config Release --target quantize
+$(LLAMA_BUILDDIR)/bin/llama-quantize: $(LLAMA_BUILDDIR)/Makefile
+	cmake --build $(dir $<) --config Release --target llama-quantize
 
 .PHONY: quantize
 quantize: $(QUANTIZE)
 
-$(QUANTIZE): $(LLAMA_BUILDDIR)/bin/quantize
+$(QUANTIZE): $(LLAMA_BUILDDIR)/bin/llama-quantize
 	cp -a $< $@
diff --git a/llama.cpp b/llama.cpp
@@ -1 +1 @@
-Subproject commit 784e11dea1f5ce9638851b2b0dddb107e2a609c8
+Subproject commit b95c8af37ccf169b0a3216b7ed691af0534e5091
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 [build-system]
-requires = ["setuptools>=64", "setuptools_scm>=8"]
+requires = ["setuptools>=64", "setuptools_scm>=8", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
diff --git a/setup.py b/setup.py
@@ -0,0 +1,117 @@
+import os
+import platform
+import subprocess
+import sys
+
+from setuptools import setup
+from setuptools.command.build_py import build_py
+from setuptools.dist import Distribution
+from wheel.bdist_wheel import bdist_wheel as bdist_wheel
+
+# CMake options passed on every platform when configuring llama.cpp.
+# NOTE(review): options are given under both the GGML_* and LLAMA_* prefixes,
+# presumably to cover llama.cpp releases before and after the option rename;
+# confirm against the pinned submodule revision.
+CMAKE_ARGS = [
+    "-GNinja",
+    "-DCMAKE_BUILD_TYPE=Release",
+    "-DBUILD_SHARED_LIBS=OFF",
+    # build with base ISA
+    "-DGGML_NATIVE=OFF",
+    "-DLLAMA_NATIVE=OFF",
+    "-DLLAMA_BUILD_TESTS=OFF",
+    "-DLLAMA_BUILD_SERVER=OFF",
+]
+# Extra options appended when the machine type is "x86_64".
+CMAKE_ARGS_X86_64 = [
+    # force x86_64-v2 ISA
+    "-DGGML_AVX=OFF",
+    "-DGGML_AVX2=OFF",
+    "-DGGML_FMA=OFF",
+    "-DGGML_F16C=OFF",
+    "-DLLAMA_AVX=OFF",
+    "-DLLAMA_AVX2=OFF",
+    "-DLLAMA_FMA=OFF",
+    "-DLLAMA_F16C=OFF",
+]
+# Extra options appended for macOS builds on 64-bit ARM.
+CMAKE_ARGS_DARWIN_AARCH64 = [
+    # build and embed METAL on Apple M
+    "-DGGML_METAL=ON",
+    "-DGGML_METAL_EMBED_LIBRARY=ON",
+    "-DLLAMA_METAL=ON",
+    "-DLLAMA_METAL_EMBED_LIBRARY=ON",
+]
+# Name of the CMake target to build; also the file name that is looked up
+# in the "bin" directory of the CMake build tree afterwards.
+QUANTIZE_BINARY = "llama-quantize"
+
+
+class Py3NoneBdistWheel(bdist_wheel):
+    """Tag wheel as py3-none-{tag}"""
+
+    def finalize_options(self) -> None:
+        super().finalize_options()
+        self.root_is_pure = False
+
+    def get_tag(self) -> tuple[str, str, str]:
+        _py, _abi, plat_name = super().get_tag()
+        return "py3", "none", plat_name
+
+
+class QuantizeBuildPy(build_py):
+    """Hack to build and copy quantize binary with Python files"""
+
+    def build_quantize(self) -> None:
+        # Switch to scikit-build-core? I have not found an example how to
+        # ship a program with scikit-build-core.
+        arch = platform.uname().machine
+        build_cmd = self.get_finalized_command("build")
+        package_name = self.distribution.packages[0]
+        build_temp = build_cmd.build_temp
+        cmake_args = [
+            "cmake",
+            "-S",
+            "llama.cpp",
+            "-B",
+            build_temp,
+        ]
+        cmake_args.extend(CMAKE_ARGS)
+        if sys.platform == "darwin" and arch == "aarch64":
+            cmake_args.extend(CMAKE_ARGS_DARWIN_AARCH64)
+        elif arch == "x86_64":
+            cmake_args.extend(CMAKE_ARGS_X86_64)
+        print(f"Run {' '.join(cmake_args)}")
+        subprocess.check_call(cmake_args)
+
+        build_args = [
+            "cmake",
+            "--build",
+            build_temp,
+            "--config",
+            "Release",
+            "--target",
+            QUANTIZE_BINARY,
+        ]
+        print(f"Run {' '.join(build_args)}")
+        subprocess.check_call(build_args)
+
+        infile = os.path.join(build_temp, "bin", QUANTIZE_BINARY)
+        outname = f"quantize-{arch}-{sys.platform}"
+        outfile = os.path.join(self.build_lib, package_name, outname)
+        directory = os.path.dirname(outfile)
+        os.makedirs(directory, exist_ok=True)
+        self.copy_file(infile, outfile, preserve_mode=True)
+        self.package_data[package_name] = [outname]
+
+    def run(self) -> None:
+        self.build_quantize()
+        return super().run()
+
+
+class BinaryDistribution(Distribution):
+    """Mark package has platlib package"""
+
+    def has_ext_modules(foo) -> bool:
+        return True
+
+
+# No project metadata is passed here (presumably it comes from
+# pyproject.toml; verify). Only the distribution class and the two
+# customized commands are overridden.
+setup(
+    distclass=BinaryDistribution,
+    cmdclass={
+        "bdist_wheel": Py3NoneBdistWheel,
+        "build_py": QuantizeBuildPy,
+    },
+)
diff --git a/src/instructlab_quantize/quantize-aarch64-linux b/src/instructlab_quantize/quantize-aarch64-linux
diff --git a/src/instructlab_quantize/quantize-arm64-darwin b/src/instructlab_quantize/quantize-arm64-darwin
diff --git a/src/instructlab_quantize/quantize-x86_64-linux b/src/instructlab_quantize/quantize-x86_64-linux
diff --git a/tests.py b/tests.py
@@ -7,9 +7,10 @@
 import sys
 from unittest import mock
 
-import instructlab_quantize
 import pytest
 
+import instructlab_quantize
+
 PKG_DIR = pathlib.Path(instructlab_quantize.__file__).absolute().parent
 
 
diff --git a/tox.ini b/tox.ini
@@ -41,6 +41,16 @@ deps =
 commands =
     ruff format {posargs:--check}
 
+[testenv:fix]
+description = fix code with Ruff
+skip_install = True
+skipsdist = true
+deps =
+    ruff
+commands =
+    ruff format
+    ruff check --fix
+
 [gh-actions]
 python =
     3.9: py39