Add in-house MoE, loss parallel #159

Merged on Feb 10, 2025 · 235 commits

Commits
70c3a5e
Add `MoERouter`
epwalsh Jan 30, 2025
5f3994e
Add `MoEMLP`
epwalsh Jan 30, 2025
b3d1fa6
Merge branch 'main' into epwalsh/moe-in-house
epwalsh Jan 30, 2025
122af0a
update Docker build
epwalsh Jan 30, 2025
937baf5
update MLP test
epwalsh Jan 30, 2025
0b127d2
add a test with expert parallel
epwalsh Jan 30, 2025
9746d9a
clean up test
epwalsh Jan 31, 2025
1dba833
add launch script to quickly run tests
epwalsh Jan 31, 2025
90921db
fix dtype
epwalsh Jan 31, 2025
027bc5b
try no host networking
epwalsh Jan 31, 2025
68a6b0f
make host-networking configurable
epwalsh Jan 31, 2025
149e47f
add config class
epwalsh Jan 31, 2025
1dc5c12
clean up for running tests
epwalsh Jan 31, 2025
72939aa
improve script
epwalsh Jan 31, 2025
c7a3890
setup distributed
epwalsh Jan 31, 2025
3e80cc4
fix
epwalsh Jan 31, 2025
1f208fa
fix
epwalsh Jan 31, 2025
da045ea
Add parallel MLP implementation
epwalsh Jan 31, 2025
0d792d7
add MoE base
epwalsh Jan 31, 2025
a284892
Merge branch 'epwalsh/moe-in-house' into v2-moe
epwalsh Jan 31, 2025
27b72ee
Merge branch 'v2' into v2-moe
epwalsh Jan 31, 2025
d1f4984
integrate new MoE code
epwalsh Feb 1, 2025
b0155a8
fix init
epwalsh Feb 1, 2025
cacfaa8
improve how we get MoE losses
epwalsh Feb 1, 2025
055af0d
fixes
epwalsh Feb 1, 2025
0f9b518
clean up
epwalsh Feb 1, 2025
d258b42
fix
epwalsh Feb 1, 2025
02fa200
Add router test
epwalsh Feb 1, 2025
31a972f
fix config
epwalsh Feb 1, 2025
d26755d
fixes
epwalsh Feb 1, 2025
dd91090
fix?
epwalsh Feb 1, 2025
05026cd
fix loss
epwalsh Feb 1, 2025
925bf24
Add test with expert parallelism
epwalsh Feb 1, 2025
38b674a
lol, fix
epwalsh Feb 1, 2025
daff437
fix dtypes?
epwalsh Feb 1, 2025
dfde49d
fix some typos
epwalsh Feb 1, 2025
b8bee1e
check that loss is finite
epwalsh Feb 1, 2025
1f869de
compute active params
epwalsh Feb 2, 2025
41ccfe4
Allow expert parallelism
epwalsh Feb 2, 2025
70a1275
test size of experts
epwalsh Feb 2, 2025
953049c
don't require grouped gemm for MoEMLP test
epwalsh Feb 2, 2025
4709940
move losses to their own module
epwalsh Feb 2, 2025
f9f79b6
remove megablocks from build
epwalsh Feb 2, 2025
674850d
update build deps
epwalsh Feb 2, 2025
36ab5b1
fix
epwalsh Feb 2, 2025
b94fa69
update stable image
epwalsh Feb 2, 2025
0479f74
update nightly build
epwalsh Feb 2, 2025
59867e8
pin grouped gemm to commit
epwalsh Feb 2, 2025
3a7bc84
build with CUTLASS again
epwalsh Feb 2, 2025
c54d882
update images used
epwalsh Feb 2, 2025
0c7baed
fix expert parallelism, implement sequence parallelism
epwalsh Feb 3, 2025
dfb118a
fix test
epwalsh Feb 3, 2025
baef2f2
fix
epwalsh Feb 3, 2025
d352a2b
fix
epwalsh Feb 3, 2025
83d28af
Start on regular MoE
epwalsh Feb 3, 2025
62dfdc4
finish?
epwalsh Feb 3, 2025
ff743e3
refactor
epwalsh Feb 3, 2025
aa7da18
fix
epwalsh Feb 3, 2025
76c5ec2
fix
epwalsh Feb 3, 2025
8fb887d
fix
epwalsh Feb 3, 2025
6951bb1
clean up
epwalsh Feb 3, 2025
c8be5dc
fix
epwalsh Feb 3, 2025
774b77f
fix
epwalsh Feb 4, 2025
7ecebd2
add parallel test for default
epwalsh Feb 4, 2025
6841a6a
fix?
epwalsh Feb 4, 2025
62f5496
clean up
epwalsh Feb 4, 2025
45482d8
clean up
epwalsh Feb 4, 2025
4f38c01
improve test
epwalsh Feb 4, 2025
3a1b06c
fix
epwalsh Feb 4, 2025
f085171
fix
epwalsh Feb 4, 2025
f077870
fix
epwalsh Feb 4, 2025
3fda274
fix
epwalsh Feb 4, 2025
3c3544f
fix
epwalsh Feb 4, 2025
29a7fcc
fix
epwalsh Feb 4, 2025
54ae3ac
add more
epwalsh Feb 4, 2025
4c3d0a9
add
epwalsh Feb 4, 2025
9fb3d52
init weights
epwalsh Feb 4, 2025
dc5c13f
add extra repr to MLP class
epwalsh Feb 4, 2025
97bf618
debugging
epwalsh Feb 4, 2025
5073d50
debug
epwalsh Feb 4, 2025
242649a
fix
epwalsh Feb 4, 2025
0c94a00
clean up
epwalsh Feb 4, 2025
a6cb7ef
debug
epwalsh Feb 4, 2025
66f4afd
debugging
epwalsh Feb 4, 2025
9582c3c
more debug
epwalsh Feb 4, 2025
461072a
lol
epwalsh Feb 4, 2025
d44000c
expert indices
epwalsh Feb 4, 2025
acf03be
try w/ uniform assignment
epwalsh Feb 4, 2025
1ee38fa
fix
epwalsh Feb 4, 2025
fd945a5
debug
epwalsh Feb 4, 2025
e13fedc
small bz
epwalsh Feb 4, 2025
87afd27
more
epwalsh Feb 5, 2025
2ceba52
more
epwalsh Feb 5, 2025
7db7018
more debug
epwalsh Feb 5, 2025
157b383
try this
epwalsh Feb 5, 2025
b111659
try this
epwalsh Feb 5, 2025
0cc1c78
try this
epwalsh Feb 5, 2025
fea70b7
cache
epwalsh Feb 5, 2025
06784d0
fix
epwalsh Feb 5, 2025
9a2cdad
cache
epwalsh Feb 5, 2025
a6b7b93
clean up
epwalsh Feb 5, 2025
0024a8c
add tests for ops
epwalsh Feb 5, 2025
bc61bfa
fix?
epwalsh Feb 5, 2025
74c4e11
fix?
epwalsh Feb 5, 2025
069a77d
add another test
epwalsh Feb 5, 2025
650f030
test with shared
epwalsh Feb 5, 2025
c875ab7
fix
epwalsh Feb 5, 2025
b925af2
check losses
epwalsh Feb 5, 2025
48d7e9a
fix
epwalsh Feb 5, 2025
d47c29d
fix?
epwalsh Feb 5, 2025
0d1ea1f
clean up
epwalsh Feb 5, 2025
4d621d8
clean up
epwalsh Feb 5, 2025
5db8850
comments
epwalsh Feb 5, 2025
3fd2c64
add batched histc
epwalsh Feb 5, 2025
91365f3
stuff
epwalsh Feb 6, 2025
ba2f6ad
add config builder for MoE
epwalsh Feb 6, 2025
729e6cc
Merge branch 'v2' into v2-moe
epwalsh Feb 6, 2025
b315a9c
Merge branch 'v2' into v2-moe
epwalsh Feb 6, 2025
108fd69
Add SmallMoE config
epwalsh Feb 6, 2025
8cd7ed1
use replicate with EP
epwalsh Feb 6, 2025
10e845c
fix?
epwalsh Feb 6, 2025
c130305
dumb
epwalsh Feb 6, 2025
2c6bbc7
fix test?
epwalsh Feb 6, 2025
9bbbcad
okay, let's try this
epwalsh Feb 7, 2025
b134bc9
fix
epwalsh Feb 7, 2025
1474827
require HSDP for expert parallelism
epwalsh Feb 7, 2025
e7e726e
fix dtype
epwalsh Feb 7, 2025
64e384d
fewer active experts
epwalsh Feb 7, 2025
2a9df66
idk
epwalsh Feb 7, 2025
557699e
try this
epwalsh Feb 7, 2025
6972b22
fix?
epwalsh Feb 7, 2025
af6a774
clean up
epwalsh Feb 7, 2025
75dc2bb
custom op
epwalsh Feb 7, 2025
c7b248a
try again
epwalsh Feb 7, 2025
4fb0cfd
try this
epwalsh Feb 7, 2025
24a41a0
pre-cast to int
epwalsh Feb 7, 2025
095f389
debugging
epwalsh Feb 7, 2025
fc1c248
try not flattening
epwalsh Feb 7, 2025
f9d70b0
revert
epwalsh Feb 7, 2025
b1a423c
try this
epwalsh Feb 7, 2025
5c62a67
let's try this
epwalsh Feb 7, 2025
dc0e960
fix
epwalsh Feb 7, 2025
5f47723
fix
epwalsh Feb 7, 2025
45ec6d8
fix
epwalsh Feb 7, 2025
0936393
clean up
epwalsh Feb 7, 2025
4e48bb2
logging
epwalsh Feb 7, 2025
d63e372
fix
epwalsh Feb 7, 2025
99a7f09
clean up
epwalsh Feb 7, 2025
2bcc079
try with replicate
epwalsh Feb 7, 2025
b2c39b8
clean up
epwalsh Feb 7, 2025
d988ff2
back to sharding
epwalsh Feb 7, 2025
b441c45
clean up
epwalsh Feb 7, 2025
18a0073
try dropless
epwalsh Feb 7, 2025
7d7f7b5
revert change to dropless
epwalsh Feb 7, 2025
281785d
fixes
epwalsh Feb 7, 2025
4c6d48e
try again
epwalsh Feb 7, 2025
4a0c0e2
try this
epwalsh Feb 7, 2025
c66bede
idk
epwalsh Feb 7, 2025
a2b8a1b
fix?
epwalsh Feb 7, 2025
ea2bb5c
idk
epwalsh Feb 7, 2025
f8d5940
debugging
epwalsh Feb 7, 2025
406d5ed
make input local
epwalsh Feb 7, 2025
5928f0e
debug
epwalsh Feb 7, 2025
3c65c0c
debug
epwalsh Feb 7, 2025
0c4b132
debug
epwalsh Feb 7, 2025
a829d73
try this
epwalsh Feb 7, 2025
f247d89
debug
epwalsh Feb 7, 2025
78722ca
try again
epwalsh Feb 7, 2025
06d88e3
assert
epwalsh Feb 7, 2025
9ceb128
fix that
epwalsh Feb 7, 2025
cead175
debug
epwalsh Feb 7, 2025
794bc03
try this
epwalsh Feb 7, 2025
fb415b6
maybe fix
epwalsh Feb 7, 2025
76c322a
remove inplace op
epwalsh Feb 7, 2025
d9d9d33
clean up
epwalsh Feb 7, 2025
0ea34ed
clean up tensor parallel
epwalsh Feb 8, 2025
ae24c2a
fix
epwalsh Feb 8, 2025
da04166
try this?
epwalsh Feb 8, 2025
728121e
fix
epwalsh Feb 8, 2025
d613fea
ooops
epwalsh Feb 8, 2025
dc4a8a8
debug
epwalsh Feb 8, 2025
e19ace5
fix
epwalsh Feb 8, 2025
94d3178
extra safety
epwalsh Feb 8, 2025
f3de9c6
fix router test
epwalsh Feb 8, 2025
9c9f5e7
implement loss parallel
epwalsh Feb 8, 2025
7d7249e
do in pipeline too
epwalsh Feb 8, 2025
3994f41
start test for CE loss
epwalsh Feb 8, 2025
5deafea
clean up
epwalsh Feb 8, 2025
9550286
fix
epwalsh Feb 8, 2025
2c7219f
clean up
epwalsh Feb 8, 2025
cfcc2a1
fix?
epwalsh Feb 8, 2025
041bbcd
clean up
epwalsh Feb 8, 2025
3cab69c
add case for none reduction
epwalsh Feb 8, 2025
30245a4
ok try again
epwalsh Feb 9, 2025
cfaf057
fix
epwalsh Feb 9, 2025
761c258
try this
epwalsh Feb 9, 2025
ae8b27d
fix?
epwalsh Feb 9, 2025
75eee3a
fix
epwalsh Feb 9, 2025
a035ee2
fix
epwalsh Feb 9, 2025
b1e2db4
fix
epwalsh Feb 9, 2025
4828414
fix
epwalsh Feb 9, 2025
524876c
debug
epwalsh Feb 9, 2025
00a7b6e
fix?
epwalsh Feb 9, 2025
7509318
check for gradients
epwalsh Feb 9, 2025
31e6615
update train modules
epwalsh Feb 9, 2025
55d60f5
fix
epwalsh Feb 9, 2025
8c9cac1
fix?
epwalsh Feb 9, 2025
db7e42d
idk
epwalsh Feb 9, 2025
747a7ad
make loss parallel configurable
epwalsh Feb 9, 2025
eeb310a
add long context config
epwalsh Feb 9, 2025
2537864
increase context length
epwalsh Feb 9, 2025
e8db8ba
try 64
epwalsh Feb 9, 2025
ae58452
rename some things
epwalsh Feb 10, 2025
91679ef
back to non moe
epwalsh Feb 10, 2025
9e426d5
fix?
epwalsh Feb 10, 2025
5ea11ea
fix?
epwalsh Feb 10, 2025
92538ca
fixed eval sequence length when TP enabled
epwalsh Feb 10, 2025
2bfafde
fix eval batch size
epwalsh Feb 10, 2025
2b2d838
revert the FSL
epwalsh Feb 10, 2025
7b9488f
upgrade olmo-eval
epwalsh Feb 10, 2025
0c7cadc
update test
epwalsh Feb 10, 2025
95ca989
fix?
epwalsh Feb 10, 2025
be2b15b
check loss first
epwalsh Feb 10, 2025
cd818fd
compare local tensor
epwalsh Feb 10, 2025
0ec327c
clean up
epwalsh Feb 10, 2025
d87a5d3
try this
epwalsh Feb 10, 2025
2ea5307
fix?
epwalsh Feb 10, 2025
c55d733
fix
epwalsh Feb 10, 2025
f4ec2e1
exclude default evals from long context config
epwalsh Feb 10, 2025
77d3d71
okay try without that
epwalsh Feb 10, 2025
c84adef
revert
epwalsh Feb 10, 2025
53c1f6c
update install instructions
epwalsh Feb 10, 2025
45fe06c
try with host-networking
epwalsh Feb 10, 2025
9e656d7
use multiple GPUs for MoE test
epwalsh Feb 10, 2025
06d8140
update long context defaults
epwalsh Feb 10, 2025
9 changes: 5 additions & 4 deletions .github/workflows/main.yml
@@ -111,7 +111,7 @@ jobs:
matrix:
task:
- name: Test (GPU)
image: olmo-core-tch251cu124
image: olmo-core-tch260cu124
gpus: 2
run: |
pytest -v --color=yes --durations=3 -m gpu \
@@ -120,15 +120,15 @@ jobs:
src/test/

- name: Test checkpoint (GPU)
image: olmo-core-tch251cu124
image: olmo-core-tch260cu124
gpus: 2
run: |
pytest -v --color=yes --durations=3 -m gpu \
src/test/distributed/checkpoint*

- name: Test MoE (GPU)
image: olmo-core-tch251cu124
gpus: 1
image: olmo-core-tch260cu124
gpus: 2
run: |
pytest -v --color=yes --durations=3 -m gpu \
src/test/nn/moe*
@@ -182,6 +182,7 @@ jobs:
preemptible: true
resources:
gpuCount: ${{ matrix.task.gpus }}
hostNetworking: true
constraints:
cluster:
# H100 clusters
14 changes: 7 additions & 7 deletions Makefile
@@ -1,14 +1,14 @@
CUDA_VERSION = "12.4"
TORCH_CUDA_VERSION = $(shell echo $(CUDA_VERSION) | tr -d .)
TORCH_VERSION = "2.5.1"
TORCH_VERSION = "2.6.0"
TORCH_VERSION_SHORT = $(shell echo $(TORCH_VERSION) | tr -d .)
# NOTE: when upgrading the nightly version you also need to upgrade the torch version specification
# in 'pyproject.toml' to include that nightly version.
TORCH_NIGHTLY_VERSION = "2.6.0.dev20241209"
TORCH_NIGHTLY_VERSION = "2.7.0.dev20250202"
TORCH_NIGHTLY_VERSION_SHORT = $(shell echo $(TORCH_NIGHTLY_VERSION) | tr -d .)
TORCHAO_VERSION = "0.6.1"
MEGABLOCKS_VERSION = "megablocks[gg] @ git+https://[email protected]/epwalsh/megablocks.git@epwalsh/deps"
FLASH_ATTN_WHEEL = https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
TORCHAO_VERSION = "0.8.0"
GROUPED_GEMM_VERSION = "grouped_gemm @ git+https://[email protected]/tgale96/grouped_gemm.git@main"
FLASH_ATTN_WHEEL = https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl

VERSION = $(shell python src/olmo_core/version.py)
VERSION_SHORT = $(shell python src/olmo_core/version.py short)
@@ -55,7 +55,7 @@ stable-image :
--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
--build-arg TORCH_VERSION=$(TORCH_VERSION) \
--build-arg FLASH_ATTN_WHEEL=$(FLASH_ATTN_WHEEL) \
--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
--build-arg GROUPED_GEMM_VERSION=$(GROUPED_GEMM_VERSION) \
--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
--target stable \
--progress plain \
@@ -70,7 +70,7 @@ nightly-image :
--build-arg TORCH_CUDA_VERSION=$(TORCH_CUDA_VERSION) \
--build-arg TORCH_VERSION=$(TORCH_VERSION) \
--build-arg FLASH_ATTN_WHEEL=$(FLASH_ATTN_WHEEL) \
--build-arg MEGABLOCKS_VERSION=$(MEGABLOCKS_VERSION) \
--build-arg GROUPED_GEMM_VERSION=$(GROUPED_GEMM_VERSION) \
--build-arg TORCHAO_VERSION=$(TORCHAO_VERSION) \
--build-arg TORCH_NIGHTLY_VERSION=$(TORCH_NIGHTLY_VERSION) \
--target nightly \
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ pip install ai2-olmo-core
There are a number of optional dependencies that must be installed to use certain functionality as well, including:
- [flash-attn](https://github.com/Dao-AILab/flash-attention) for flash attention and certain other fused operations.
- [torchao](https://github.com/pytorch/ao) for float8 training.
- [megablocks](https://github.com/databricks/megablocks) for mixture-of-experts (MoE) models.
- [grouped_gemm](https://github.com/tgale96/grouped_gemm) for dropless mixture-of-experts (MoE) models. You may need to compile from source until [PR #21](https://github.com/tgale96/grouped_gemm/pull/21) is released (post v0.1.6).

The published [Docker images](https://github.com/orgs/allenai/packages?repo_name=OLMo-core) contain all core and optional dependencies, and are regularly tested on our in-house H100 clusters.
But there are several things to keep in mind if you intend to use these images:
2 changes: 1 addition & 1 deletion docs/source/overview/installation.rst
@@ -12,4 +12,4 @@ There are a number of optional dependencies that must be installed to use certai

- `flash-attn <https://github.com/Dao-AILab/flash-attention>`_ for flash attention and certain other fused operations.
- `torchao <https://github.com/pytorch/ao>`_ for float8 training (see :mod:`olmo_core.float8`).
- `megablocks <https://github.com/databricks/megablocks>`_ for mixture-of-experts (MoE) models (see :mod:`olmo_core.nn.moe`).
- `grouped_gemm <https://github.com/tgale96/grouped_gemm>`_ for dropless mixture-of-experts (MoE) models (see :mod:`olmo_core.nn.moe`).
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
"omegaconf",
"safetensors",
"importlib_resources",
"ai2-olmo-eval==0.5.0",
"ai2-olmo-eval==0.6.1",
]

[project.urls]
@@ -169,4 +169,5 @@ filterwarnings = [
'ignore::DeprecationWarning:pkg_resources',
'ignore::DeprecationWarning:google\.rpc',
'ignore::FutureWarning:torch\.distributed\.checkpoint\.default_planner',
'ignore::UserWarning:torch\.distributed\.checkpoint\.state_dict_saver',
]
25 changes: 13 additions & 12 deletions src/Dockerfile
@@ -1,7 +1,7 @@
# NOTE: make sure CUDA_VERSION and TORCH_CUDA_VERSION always match, except for punctuation
ARG CUDA_VERSION="12.4"
ARG TORCH_CUDA_VERSION="124"
ARG TORCH_VERSION="2.5.1"
ARG TORCH_VERSION="2.6.0

#########################################################################
# Build image
@@ -24,22 +24,23 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# Install/upgrade Python build dependencies.
RUN pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja

# Build megablocks, grouped-gemm, stanford-stk
# Build grouped-gemm.
# NOTE: right now we need to build with CUTLASS so we can pass batch sizes on GPU.
# See https://github.com/tgale96/grouped_gemm/pull/21
ENV TORCH_CUDA_ARCH_LIST="8.0 9.0"
ENV GROUPED_GEMM_CUTLASS="1"
ARG MEGABLOCKS_VERSION="megablocks[gg] @ git+https://[email protected]/epwalsh/megablocks.git@epwalsh/deps"
RUN pip wheel --no-build-isolation --no-cache-dir "${MEGABLOCKS_VERSION}"
ARG GROUPED_GEMM_VERSION="grouped_gemm @ git+https://[email protected]/tgale96/grouped_gemm.git@main"
RUN pip wheel --no-build-isolation --no-cache-dir "${GROUPED_GEMM_VERSION}"

# Build flash-attn.
ARG FLASH_ATTN_WHEEL=https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
ARG FLASH_ATTN_WHEEL=https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
RUN wget ${FLASH_ATTN_WHEEL}

# Only keep the target wheels and dependencies with CUDA extensions.
RUN echo "Built wheels:" \
&& ls -lh . \
&& ls -1 | grep -Ev 'megablocks|grouped_gemm|stanford_stk|flash_attn' | xargs rm \
&& echo "Final wheels:" \
&& ls -lh .
RUN echo "Built wheels:" && ls -lh .
# && ls -1 | grep -Ev 'grouped_gemm|flash_attn' | xargs rm \
# && echo "Final wheels:" \
# && ls -lh .

#########################################################################
# Stable image
@@ -73,7 +74,7 @@ RUN pip install --upgrade --no-cache-dir pip wheel packaging

# Install torchao.
ARG TORCH_CUDA_VERSION
ARG TORCHAO_VERSION="0.6.1"
ARG TORCHAO_VERSION="0.8.0"
RUN pip install --no-cache-dir \
--extra-index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} \
torchao==${TORCHAO_VERSION}
@@ -100,7 +101,7 @@ WORKDIR /app/olmo-core
FROM stable as nightly

ARG TORCH_CUDA_VERSION
ARG TORCH_NIGHTLY_VERSION="2.6.0.dev20241209"
ARG TORCH_NIGHTLY_VERSION="2.7.0.dev20250202"
RUN pip install --no-cache-dir --pre \
--index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} \
torch==${TORCH_NIGHTLY_VERSION}+cu${TORCH_CUDA_VERSION}
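
The CUTLASS note in the Dockerfile above is the reason grouped-gemm is built from source: with GROUPED_GEMM_CUTLASS=1 (and the fix in grouped_gemm PR #21), the per-expert batch sizes can be passed as a GPU tensor instead of having to live on the CPU. A rough sketch of the call pattern, assuming the gg.ops.gmm(a, b, batch_sizes) entry point described in the grouped_gemm README; the shapes and names here are illustrative, not OLMo-core code:

import torch
import grouped_gemm as gg  # assumes a CUTLASS-enabled build

# Tokens already permuted so that each expert's tokens are contiguous.
num_experts, tokens, d_in, d_out = 4, 1024, 256, 512
a = torch.randn(tokens, d_in, dtype=torch.bfloat16, device="cuda")              # routed token activations
b = torch.randn(num_experts, d_in, d_out, dtype=torch.bfloat16, device="cuda")  # one weight matrix per expert
batch_sizes = torch.tensor([256, 256, 256, 256])  # tokens per expert; stock grouped_gemm expects this on CPU
# With the CUTLASS build (grouped_gemm PR #21) the counts can instead stay on the GPU:
# batch_sizes = batch_sizes.cuda()
out = gg.ops.gmm(a, b, batch_sizes)  # -> (tokens, d_out)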
1 change: 1 addition & 0 deletions src/examples/llama/train.py
@@ -57,6 +57,7 @@ def build_config(run_name: str, overrides: List[str]) -> ExperimentConfig:
tokenizer_config = TokenizerConfig.gpt2()

model_config = TransformerConfig.llama2_271M(
# model_config = TransformerConfig.smallmoe(
vocab_size=tokenizer_config.padded_vocab_size(), # a little bigger than actual vocab size to make it a multiple of 128
)

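
For context on the commented-out line above: a hypothetical sketch of switching the example script to the new MoE config, assuming the TransformerConfig.smallmoe() builder added in this PR takes the same vocab_size argument as llama2_271M() (the import paths below are assumptions, not taken from this diff):

from olmo_core.data import TokenizerConfig              # assumed import path
from olmo_core.nn.transformer import TransformerConfig  # assumed import path

tokenizer_config = TokenizerConfig.gpt2()

# Swap the dense Llama config for the small MoE config added in this PR.
model_config = TransformerConfig.smallmoe(
    vocab_size=tokenizer_config.padded_vocab_size(),  # padded to a multiple of 128
)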
4 changes: 3 additions & 1 deletion src/olmo_core/data/utils.py
@@ -19,6 +19,7 @@

import numpy as np
import torch
import torch.nn.functional as F

from olmo_core.aliases import PathOrStr
from olmo_core.io import add_cached_path_clients, get_bytes_range, is_url, resource_path
@@ -467,4 +468,5 @@ def get_labels(batch: Dict[str, Any], label_ignore_index: int = -100) -> torch.T
labels.masked_fill_(attention_mask == 0.0, label_ignore_index)
if instance_mask is not None:
labels.masked_fill_(~instance_mask.unsqueeze(-1), value=label_ignore_index)
return labels[..., 1:].contiguous()
# Shift and pad.
return F.pad(labels[..., 1:], (0, 1, 0, 0), value=label_ignore_index)
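
To make the new return statement concrete, here is a small illustrative sketch of just the final shift-and-pad step (not the full get_labels): the labels are the inputs shifted left by one position, and the last position is filled with label_ignore_index, so the result keeps the same sequence length as the input rather than being one token shorter as before.

import torch
import torch.nn.functional as F

label_ignore_index = -100
labels = torch.tensor([[5, 6, 7, 8]])  # pretend token IDs for one sequence

old_style = labels[..., 1:].contiguous()
# tensor([[6, 7, 8]])        -- one token shorter than the input

new_style = F.pad(labels[..., 1:], (0, 1, 0, 0), value=label_ignore_index)
# tensor([[6, 7, 8, -100]])  -- same length as the input; the final,
# unpredictable position is excluded from the loss via the ignore index
print(old_style)
print(new_style)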