vaexio · 2maz · Dec 18, 2024 · Jan 13, 2025 · Jan 14, 2025 · Jan 14, 2025
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -23,10 +23,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
-        exclude:
-          - {os: windows-latest, python-version: "3.8"}
-          - {os: macos-latest, python-version: "3.8"}
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
     - uses: actions/checkout@v2
@@ -100,6 +97,7 @@ jobs:
 
     - name: Check ml spec
       run: |
+        pip install "dask[dataframe]"  # quoted: brackets are glob characters to the shell
         python -m vaex.ml.spec packages/vaex-ml/vaex/ml/spec_new.json
         diff packages/vaex-ml/vaex/ml/spec_new.json packages/vaex-ml/vaex/ml/spec.json
 
@@ -127,7 +125,7 @@ jobs:
         PROJECT_ID: ${{ secrets.GCP_PROJECT_ID_VAEX }}
       run: ./ci/06-run-contrib-tests.sh
 
-    - name: Test comand line
+    - name: Test command line
       run: |
         vaex convert ~/.vaex/data/helmi-dezeeuw-2000-FeH-v2-10percent.hdf5 test.parquet
         VAEX_PROGRESS_TYPE=rich vaex convert ~/.vaex/data/helmi-dezeeuw-2000-FeH-v2-10percent.hdf5 test.parquet

diff --git a/bin/install_pcre.sh b/bin/install_pcre.sh
@@ -85,10 +85,32 @@ function build_simple {
     fi
     # touch "${name}-stamp"
 }
+
+
 function build_pcre {
-    echo "Build"
+    echo "Build pcre"
     echo $ARCHFLAGS
     build_simple pcre $PCRE_VERSION http://ftp.exim.org/pub/pcre/
 }
-echo Build pcre
-build_pcre
+
+function install_precompiled() {
+    # Install pcre incl. development headers via the first package manager found; returns non-zero if none worked.
+    # Mac https://formulae.brew.sh/formula/pcre | Debian/Ubuntu https://packages.ubuntu.com/libpcre3-dev
+    # Alpine https://pkgs.alpinelinux.org/package/edge/main/x86_64/pcre (headers are in the pcre-dev subpackage)
+    # RHEL https://git.almalinux.org/rpms/pcre (headers are in the pcre-devel subpackage)
+    if command -v brew >/dev/null 2>&1; then
+        brew install pcre
+    elif command -v apt-get >/dev/null 2>&1; then  # apt-get: 'apt' has no stable CLI for scripts
+        apt-get update
+        apt-get install -y libpcre3-dev
+    elif command -v apk >/dev/null 2>&1; then
+        apk add --update pcre-dev
+    elif command -v dnf >/dev/null 2>&1; then
+        dnf --setopt install_weak_deps=false -y install pcre-devel
+    else
+        false
+    fi
+}
+
+echo "Install pcre"
+install_precompiled || build_pcre
diff --git a/docs/source/tutorial_ml.ipynb b/docs/source/tutorial_ml.ipynb
@@ -1190,7 +1190,7 @@
    "source": [
     "## Supervised learning\n",
     "\n",
-    "While `vaex.ml` does not yet implement any supervised machine learning models, it does provide wrappers to several popular libraries such as [scikit-learn](https://scikit-learn.org/), [XGBoost](https://xgboost.readthedocs.io/), [LightGBM](https://lightgbm.readthedocs.io/) and [CatBoost](https://catboost.ai/). \n",
+    "While `vaex.ml` does not yet implement any supervised machine learning models, it does provide wrappers to several popular libraries such as [scikit-learn](https://scikit-learn.org/), [XGBoost](https://xgboost.readthedocs.io/), [LightGBM](https://lightgbm.readthedocs.io/) and [CatBoost](https://catboost.ai/) (the latter only with numpy < 2). \n",
     "\n",
     "The main benefit of these wrappers is that they turn the models into `vaex.ml` transformers. This means the models become part of the DataFrame _state_ and thus can be serialized, and their predictions can be returned as _virtual columns_. This is especially useful for creating various diagnostic plots and evaluating performance metrics at no memory cost, as well as building ensembles. \n",
     "\n",
@@ -1467,7 +1467,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### `CatBoost` example\n",
+    "### `CatBoost` example (numpy < 2 only)\n",
     "\n",
     "The CatBoost library supports summing up models. With this feature, we can use CatBoost to train a model using data that is otherwise too large to fit in memory. The idea is to train a single CatBoost model per chunk of data, and than sum up the invidiual models to create a master model. To use this feature via `vaex.ml` just specify the `batch_size` argument in the `CatBoostModel` wrapper. One can also specify additional options such as the strategy on how to sum up the individual models, or how they should be weighted."
    ]
@@ -1542,35 +1542,37 @@
     }
    ],
    "source": [
-    "from vaex.ml.catboost import CatBoostModel\n",
-    "\n",
-    "df = vaex.datasets.iris_1e8()\n",
-    "df_train, df_test = df.ml.train_test_split(test_size=0.2, verbose=False)\n",
-    "\n",
-    "features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']\n",
-    "target = 'class_'\n",
-    "\n",
-    "params = {\n",
-    "    'leaf_estimation_method': 'Gradient',\n",
-    "    'learning_rate': 0.1,\n",
-    "    'max_depth': 3,\n",
-    "    'bootstrap_type': 'Bernoulli',\n",
-    "    'subsample': 0.8,\n",
-    "    'sampling_frequency': 'PerTree',\n",
-    "    'colsample_bylevel': 0.8,\n",
-    "    'reg_lambda': 1,\n",
-    "    'objective': 'MultiClass',\n",
-    "    'eval_metric': 'MultiClass',\n",
-    "    'random_state': 42,\n",
-    "    'verbose': 0,\n",
-    "}\n",
-    "\n",
-    "booster = CatBoostModel(features=features, target=target, num_boost_round=23, \n",
-    "                        params=params, prediction_type='Class', batch_size=11_000_000)\n",
-    "booster.fit(df=df_train, progress='widget')\n",
-    "\n",
-    "df_test = booster.transform(df_train)\n",
-    "df_test"
+    "import numpy as np\n",
+    "if np.lib.NumpyVersion(np.__version__) < '2.0.0':\n",
+    "    from vaex.ml.catboost import CatBoostModel\n",
+    "\n",
+    "    df = vaex.datasets.iris_1e8()\n",
+    "    df_train, df_test = df.ml.train_test_split(test_size=0.2, verbose=False)\n",
+    "\n",
+    "    features = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']\n",
+    "    target = 'class_'\n",
+    "\n",
+    "    params = {\n",
+    "        'leaf_estimation_method': 'Gradient',\n",
+    "        'learning_rate': 0.1,\n",
+    "        'max_depth': 3,\n",
+    "        'bootstrap_type': 'Bernoulli',\n",
+    "        'subsample': 0.8,\n",
+    "        'sampling_frequency': 'PerTree',\n",
+    "        'colsample_bylevel': 0.8,\n",
+    "        'reg_lambda': 1,\n",
+    "        'objective': 'MultiClass',\n",
+    "        'eval_metric': 'MultiClass',\n",
+    "        'random_state': 42,\n",
+    "        'verbose': 0,\n",
+    "    }\n",
+    "\n",
+    "    booster = CatBoostModel(features=features, target=target, num_boost_round=23, \n",
+    "                            params=params, prediction_type='Class', batch_size=11_000_000)\n",
+    "    booster.fit(df=df_train, progress='widget')\n",
+    "\n",
+    "    df_test = booster.transform(df_train)\n",
+    "    df_test"
    ]
   },
   {

diff --git a/packages/vaex-astro/setup.py b/packages/vaex-astro/setup.py
@@ -12,7 +12,7 @@
 license     = 'MIT'
 version     = version.__version__
 url         = 'https://www.github.com/maartenbreddels/vaex'
-install_requires_astro = ['vaex-core~=4.5', 'astropy']
+install_requires_astro = ['vaex-core~=4.5', 'astropy>=6.0.0']
 
 setup(
     name=name + '-astro',

diff --git a/packages/vaex-astro/vaex/astro/tap.py b/packages/vaex-astro/vaex/astro/tap.py
@@ -81,7 +81,7 @@ def __getitem__(self, slice):
         'DOUBLE':np.float64,
         'BIGINT':np.int64,
         'INTEGER':np.int32,
-        'BOOLEAN':np.bool8
+        'BOOLEAN': np.bool_ if np.lib.NumpyVersion(np.__version__) >= '1.24.0' else np.bool8
     }
     #not supported types yet 'VARCHAR',', u'BOOLEAN', u'INTEGER', u'CHAR
     def __init__(self, tap_url="http://gaia.esac.esa.int/tap-server/tap/g10_smc", table_name=None):

diff --git a/packages/vaex-core/pyproject.toml b/packages/vaex-core/pyproject.toml
@@ -1,8 +1,10 @@
 [build-system]
 # Minimum requirements for the build system to execute.
 requires = [
-    "oldest-supported-numpy; python_version=='3.8'", # deprecated ref https://github.com/scipy/oldest-supported-numpy
-    "numpy~=1.25; python_version>'3.8'",  # numpy~=2.0 fails, backward compatible build-system as of v1.25 ref https://numpy.org/doc/2.1/dev/depending_on_numpy.html#build-time-dependency
+    # python 3.9 supported in numpy v2.0
+    # python 3.13 supported as of numpy v2.1
+    # ref https://github.com/scipy/oldest-supported-numpy/pull/86
+    "numpy~=2.0",
     "scikit-build",
     "cmake",
     "ninja"

diff --git a/packages/vaex-core/setup.py b/packages/vaex-core/setup.py
@@ -23,38 +23,37 @@
 license = "MIT"
 version = version.__version__
 url = "https://www.github.com/maartenbreddels/vaex"
-# TODO: after python2 supports frops, future and futures can also be dropped
-setup_requires = ["numpy~=1.17"]
+setup_requires = ["numpy~=2.0"] # see vaex-core pyproject.toml
 install_requires_core = [
-    "numpy~=1.17",
     "aplus",
-    "tabulate>=0.8.3",
-    "dask!=2022.4.0,<2024.9",  # fingerprinting in no longer deterministic as of 2024.9.0
-    "future>=0.15.2",
-    "pyyaml",
-    "six",
+    "blake3",
     "cloudpickle",
-    "pandas>=1.0,<3",
+    "dask!=2022.4.0,<2024.9",  # fingerprinting is no longer deterministic as of 2024.9.0
+    "filelock",
+    "frozendict!=2.2.0",
+    "future",
     "nest_asyncio>=1.3.3",
+    "numpy>=1.19.3,<3", # 1.19.3 is the first version with 3.9 wheels
+    "pandas>=1.0,<3",
     "pyarrow>=5.0.0",
-    "frozendict!=2.2.0",
-    "blake3",
-    "filelock",
     "pydantic>=1.8.0",
+    "pyyaml",
     "rich",
+    "six",
+    "tabulate>=0.8.3",
 ]
 extras_require_core = {
     "all": [
-        "gcsfs>=0.6.2",
-        "s3fs",
-        "ipyvolume",
+        "aplus",
         "diskcache",
         "fsspec",
+        "gcsfs>=0.6.2",
+        "graphviz",
         "h5py",
         "httpx",
-        "aplus",
+        "ipyvolume",
         "psutil",
-        "graphviz",
+        "s3fs",
     ],
 }
 
@@ -100,7 +99,8 @@ def __str__(self):
 USE_ABSL = False
 USE_TSL = True
 
-define_macros = []
+define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')]
+np_define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')]
 if USE_ABSL:
     define_macros += [("VAEX_USE_ABSL", None)]
 if USE_TSL:
@@ -113,7 +113,7 @@ def __str__(self):
 else:
     # TODO: maybe enable these flags for non-wheel/conda builds? ["-mtune=native", "-march=native"]
     extra_compile_args = [
-        "-std=c++11",
+        "-std=c++17",
         "-O3",
         "-funroll-loops",
         "-Werror=return-type",
@@ -137,6 +137,8 @@ def __str__(self):
     [os.path.relpath(os.path.join(dirname, "src/vaexfast.cpp"))],
     include_dirs=[get_numpy_include()],
     extra_compile_args=extra_compile_args,
+    define_macros=np_define_macros,
+
 )
 extension_strings = Extension(
     "vaex.superstrings",
@@ -165,6 +167,7 @@ def __str__(self):
     ],
     extra_compile_args=extra_compile_args,
     libraries=["pcre", "pcrecpp"],
+    define_macros=np_define_macros,
 )
 extension_superutils = Extension(
     "vaex.superutils",
@@ -271,9 +274,10 @@ def __str__(self):
     if not use_skbuild
     else [],
     zip_safe=False,
-    python_requires=">=3.8,<3.13",  # 3.13 needs numpy 2.1 support ref https://github.com/vaexio/vaex/pull/2434
+    # 3.9 is the oldest python version that the new numpy build system supports
+    # ref https://github.com/scipy/oldest-supported-numpy/pull/86
+    python_requires=">=3.9,<3.13",
     classifiers=[
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",

diff --git a/packages/vaex-core/src/agg.hpp b/packages/vaex-core/src/agg.hpp
@@ -54,7 +54,12 @@ template <class IndexType = default_index_type>
 class Grid {
   public:
     using index_type = IndexType;
-    Grid(std::vector<Binner *> binners) : binners(binners), dimensions(binners.size()), shapes(binners.size()), strides(binners.size()) {
+    Grid(std::vector<Binner *> binners)
+        : binners(binners)
+        , strides(binners.size())
+        , shapes(binners.size())
+        , dimensions(binners.size())
+    {
         length1d = 1;
         for (size_t i = 0; i < dimensions; i++) {
             shapes[i] = binners[i]->shape();

diff --git a/packages/vaex-core/src/agg_first.cpp b/packages/vaex-core/src/agg_first.cpp
@@ -1,5 +1,6 @@
 #include "agg_base.hpp"
 #include "utils.hpp"
+#include <stdexcept>
 
 namespace vaex {
 
@@ -38,7 +39,8 @@ class AggFirstPrimitive : public AggregatorPrimitive<DataType, DataType, IndexTy
         }
     }
     virtual void merge(std::vector<Aggregator *> others) {
-        const bool invert = this->invert;
+        throw std::runtime_error("merge: not implemented");
+        //const bool invert = this->invert;
         // for (auto i : others) {
         //     auto other = static_cast<AggFirstPrimitive *>(i);
         //     for (size_t i = 0; i < this->grid->length1d; i++) {
@@ -105,7 +107,7 @@ class AggFirstPrimitive : public AggregatorPrimitive<DataType, DataType, IndexTy
         py::object data = numpy.attr("array")(self).attr("__getitem__")(0);
         using namespace pybind11::literals; // to bring in the `_a` literal
         auto shape = py::tuple(this->grid->shapes.size());
-        for (int i = 0; i < this->grid->shapes.size(); i++) {
+        for (size_t i = 0; i < this->grid->shapes.size(); i++) {
             shape[i] = this->grid->shapes[this->grid->shapes.size() - i - 1];
         }
         return numpy_ma.attr("array")(data, "mask"_a=mask.attr("reshape")(shape).attr("T"));
@@ -198,4 +200,4 @@ void add_agg_first_primitive(py::module &m, const py::class_<Aggregator> &base)
     template void add_agg_first_primitive<type, false>(py::module & m, const py::class_<Aggregator> &base);
 #include "create_alltypes.hpp"
 
-} // namespace vaex
+} // namespace vaex
diff --git a/packages/vaex-core/src/agg_list.cpp b/packages/vaex-core/src/agg_list.cpp
@@ -79,13 +79,11 @@ class AggListPrimitive : public AggregatorPrimitive<DataType, std::vector<typena
     }
     virtual void aggregate(int grid, int thread, default_index_type *indices1d, size_t length, uint64_t offset) {
         auto data_ptr = this->data_ptr[thread];
-        auto data_ptr2 = this->data_ptr2[thread];
+        //auto data_ptr2 = this->data_ptr2[thread];
         auto data_mask_ptr = this->data_mask_ptr[thread];
         // auto data_mask_ptr2 = this->data_mask_ptr2[thread];
         auto grid_data = &this->grid_data[grid * this->grid->length1d];
         // auto grid_data_order = &this->grid_data_order[grid * this->grid->length1d];
-        auto null_count = &this->null_count[grid * this->grid->length1d];
-        auto nan_count = &this->nan_count[grid * this->grid->length1d];
 
         if (data_ptr == nullptr) {
             throw std::runtime_error("data not set");
@@ -161,10 +159,9 @@ class AggListString : public AggBaseString<StringList64, IndexType> {
             }
 
             py::gil_scoped_release release;
-            int64_t flat_length = offset;
             for (size_t j = 0; j < this->grid->length1d; j++) {
                 // for (auto &s : grid_data[j]) {
-                for (int64_t i = 0; i < grid_data[j].length; i++) {
+                for (size_t i = 0; i < grid_data[j].length; i++) {
                     if (grid_data[j].is_null(i)) {
                         sl->push_null();
                     } else {
@@ -188,20 +185,10 @@ class AggListString : public AggBaseString<StringList64, IndexType> {
         if (string_sequence == nullptr) {
             throw std::runtime_error("string_sequence not set");
         }
-        // auto data_ptr = this->data_ptr[thread];
-        auto data_ptr2 = this->data_ptr2[thread];
-        auto data_mask_ptr = this->data_mask_ptr[thread];
-        // auto data_mask_ptr2 = this->data_mask_ptr2[thread];
         auto grid_data = &this->grid_data[grid * this->grid->length1d];
-        // auto grid_data_order = &this->grid_data_order[grid * this->grid->length1d];
-
-        auto null_count = &this->null_count[grid * this->grid->length1d];
-        auto nan_count = &this->nan_count[grid * this->grid->length1d];
-
         for (size_t j = 0; j < length; j++) {
             IndexType i = indices1d[j];
             if (!string_sequence->is_null(j + offset)) {
-                // grid_data[i].push_back(string_sequence->get(j + offset));
                 grid_data[i].push(string_sequence->view(j + offset));
             } else if (!dropnull) {
                 grid_data[i].push_null();
@@ -269,4 +256,4 @@ void add_agg_list_primitive(py::module &m, const py::class_<Aggregator> &base) {
 //     py::class_<Aggregator> &base);
 // #include "create_alltypes.hpp"
 
-} // namespace vaex
+} // namespace vaex