Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
b00e4c5
refactor: generalize dataset indexing from language-based to dataset_id-based
federetyk Jan 15, 2026
17b1897
fix: solve issues in example files
federetyk Jan 15, 2026
e16f8dd
fix: add language field to MetricsResult for proper per-language aggregation
federetyk Jan 15, 2026
e254bc2
style: update docstrings to comply with NumPy style
federetyk Jan 16, 2026
40810c2
chore: merge upstream changes (v0.3.0, task renames, test refactor)
federetyk Jan 16, 2026
71d6d97
refactor: rename language_results to datasetid_results for consistency
federetyk Feb 19, 2026
647b070
docs: clarify get_dataset_language docstring on purpose and when to override
federetyk Feb 20, 2026
1b726ee
Merge branch 'techwolf-ai:main' into refactor/generalize-dataset-indexing
federetyk Feb 21, 2026
3a9514d
refactor: migrate freelancer project matching tasks to load_dataset API
federetyk Feb 21, 2026
879dece
feat: add cross-lingual aggregation modes for per-language metrics
federetyk Feb 22, 2026
e3ccb24
Merge branch 'techwolf-ai:main' into refactor/generalize-dataset-indexing
federetyk Feb 23, 2026
72b8e40
test: make it explicit that the dataset key "en" comes from the Language enum
federetyk Feb 23, 2026
033db0f
test: fix lexical baselines regression test to use dataset_id parameter
federetyk Feb 23, 2026
724b0e0
feat: add lazy execution filtering and ExecutionMode enum
federetyk Feb 23, 2026
f3c5e19
test: fix tolerance for regression test to work well on diverse environments
federetyk Feb 23, 2026
7d0b8b5
refactor: make language_aggregation_mode a non-optional parameter in …
federetyk Feb 24, 2026
4825486
refactor: migrate freelancer task to dataset_id-based language mapping
federetyk Feb 24, 2026
bbe0ac3
refactor: use language-grouped averaging in per-task aggregation
federetyk Feb 25, 2026
e1dfd9d
docs: add benchmark example scripts for each aggregation mode
federetyk Feb 25, 2026
e4a6bce
fix: remove from example the dataset that uses ESCO 1.0.5 but defines…
federetyk Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ class MyCustomRankingTask(RankingTask):
"""Override default metrics if needed"""
return ["map", "mrr", "recall@5", "recall@10"]

def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset:
def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
"""
Load dataset for a specific language and split.
Load dataset for a specific dataset ID and split.

Returns:
RankingDataset with query_texts, target_indices, and target_space
"""
Expand All @@ -196,12 +196,12 @@ class MyCustomRankingTask(RankingTask):
[0, 2], # Software Engineer -> Python, SQL
[0, 1], # Data Scientist -> Python, Machine Learning
]

return RankingDataset(
query_texts=query_texts,
target_indices=target_indices,
target_space=target_space,
language=language,
dataset_id=dataset_id,
)
```

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Feel free to make a PR to add your models & tasks to the official package! See [

### Checkpointing & Resuming

WorkRB automatically saves result checkpoints after each task completion in a specific language.
WorkRB automatically saves result checkpoints after each dataset evaluation within a task.

**Automatic Resuming** - Simply rerun with the same `output_folder`:

Expand Down
3 changes: 3 additions & 0 deletions examples/custom_model_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import torch
from sentence_transformers import SentenceTransformer

import workrb
from workrb.models.base import ModelInterface
from workrb.registry import register_model
from workrb.types import ModelInputType
Expand Down Expand Up @@ -47,10 +48,12 @@ def __init__(
self.encoder.to(device)
self.encoder.eval()

@property
def name(self) -> str:
"""Return the unique name of this model."""
return f"MyCustomModel-{self.base_model_name.split('/')[-1]}"

@property
def description(self) -> str:
"""Return the description of this model."""
return "A custom model that demonstrates WorkRB extensibility"
Expand Down
7 changes: 4 additions & 3 deletions examples/custom_task_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
and implement the required abstract methods.
"""

import workrb
from workrb.registry import register_task
from workrb.tasks.abstract.base import DatasetSplit, LabelType, Language
from workrb.tasks.abstract.ranking_base import RankingDataset, RankingTaskGroup
Expand Down Expand Up @@ -78,14 +79,14 @@ def supported_target_languages(self) -> list[Language]:
"""Supported target languages are English."""
return [Language.EN]

def load_monolingual_data(self, language: Language, split: DatasetSplit) -> RankingDataset:
def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset:
"""
Load data for evaluation.

This method must return a RankingDataset.

Args:
language: Language code (e.g., "en", "de", "fr")
dataset_id: Dataset identifier (e.g., "en", "de", "fr" for language-based tasks)
split: Data split ("test", "validation", "train")

Returns
Expand Down Expand Up @@ -121,7 +122,7 @@ def load_monolingual_data(self, language: Language, split: DatasetSplit) -> Rank
query_texts=queries,
target_indices=labels,
target_space=targets,
language=language,
dataset_id=dataset_id,
)

# Note: The evaluate() method is inherited from RankingTask and doesn't need
Expand Down
2 changes: 2 additions & 0 deletions examples/run_multiple_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Reproduce benchmark results.
"""

import workrb

if __name__ == "__main__":
# 1. Setup model and tasks
models = [
Expand Down
3 changes: 3 additions & 0 deletions src/workrb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
from workrb.registry import list_available_tasks
from workrb.results import load_results
from workrb.run import evaluate, evaluate_multiple_models, get_tasks_overview
from workrb.types import ExecutionMode, LanguageAggregationMode

__all__ = [
"ExecutionMode",
"LanguageAggregationMode",
"data",
"evaluate",
"evaluate_multiple_models",
Expand Down
12 changes: 6 additions & 6 deletions src/workrb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,24 +205,24 @@ def get_pending_work(
self,
results: BenchmarkResults | None,
tasks: Sequence[Task],
) -> list[tuple]:
) -> list[tuple[Task, str]]:
"""Determine what work still needs to be done.

Work is defined as a (task, language) combination that is not completed.
Work is defined as a (task, dataset_id) combination that is not completed.
"""
pending_work = []
for task in tasks:
for language in task.languages:
# Successful completed (task, language) combination
for dataset_id in task.dataset_ids:
# Successful completed (task, dataset_id) combination
if (
results is not None
and task.name in results.task_results
and language in results.task_results[task.name].language_results
and dataset_id in results.task_results[task.name].datasetid_results
):
continue

# Add to pending work
pending_work.append((task, language))
pending_work.append((task, dataset_id))

return pending_work

Expand Down
117 changes: 102 additions & 15 deletions src/workrb/results.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import logging
import pprint
from collections import defaultdict
from typing import Any
Expand All @@ -8,6 +9,10 @@
from pydantic import BaseModel, Field
from scipy import stats

from workrb.types import LanguageAggregationMode, get_language_grouping_key

logger = logging.getLogger(__name__)


class TaskResultMetadata(BaseModel):
"""Metadata for a task result."""
Expand All @@ -22,20 +27,28 @@ class TaskResultMetadata(BaseModel):
class MetricsResult(BaseModel):
"""Metric results for a single evaluation run.

In the benchmark, this is a single evaluation run for a single language.
In the benchmark, this is a single evaluation run for a single dataset.
"""

evaluation_time: float = Field(ge=0)
metrics_dict: dict[str, Any] = Field(default_factory=dict)
""" Dictionary of metric names to their computed values. """
input_languages: list[str] = Field(
default_factory=list,
description="Input language codes for this dataset (e.g. query languages).",
)
output_languages: list[str] = Field(
default_factory=list,
description="Output language codes for this dataset (e.g. target languages).",
)


class TaskResults(BaseModel):
"""Results for a task."""

metadata: TaskResultMetadata
language_results: dict[str, MetricsResult] # language -> results
""" Dictionary of language codes to their computed results. """
datasetid_results: dict[str, MetricsResult] # dataset_id -> results
"""Dictionary of dataset IDs to their computed results."""


class BenchmarkMetadata(BaseModel):
Expand Down Expand Up @@ -95,11 +108,23 @@ def __str__(self) -> str:

def get_num_evaluation_results(self) -> int:
"""Get the total number of evaluation results."""
return sum(len(task.language_results) for task in self.task_results.values())
return sum(len(task.datasetid_results) for task in self.task_results.values())

def get_summary_metrics(self, aggregations: tuple = ("mean", "ci_margin")) -> dict[str, float]:
def get_summary_metrics(
self,
aggregations: tuple = ("mean", "ci_margin"),
language_aggregation_mode: LanguageAggregationMode = LanguageAggregationMode.MONOLINGUAL_ONLY,
) -> dict[str, float]:
"""
Get summary metrics for the benchmark results.

Parameters
----------
aggregations : tuple
Statistics to compute (e.g. ``"mean"``, ``"ci_margin"``).
language_aggregation_mode : LanguageAggregationMode
How to determine the grouping language for per-language aggregation.
Defaults to ``MONOLINGUAL_ONLY``.
"""
mean_per_task = self._aggregate_per_task(
aggregations=aggregations,
Expand All @@ -115,6 +140,7 @@ def get_summary_metrics(self, aggregations: tuple = ("mean", "ci_margin")) -> di
)
mean_per_language = self._aggregate_per_language(
aggregations=aggregations,
aggregation_mode=language_aggregation_mode,
)

combined = {
Expand All @@ -135,7 +161,7 @@ def _aggregate_per_task(
# Collect metric values per task
raw_results = defaultdict(list)
for task_name, task_result in self.task_results.items():
for lang_metrics_result in task_result.language_results.values():
for lang_metrics_result in task_result.datasetid_results.values():
for metric_name, metric_value in lang_metrics_result.metrics_dict.items():
raw_results[(task_name, metric_name)].append(metric_value)

Expand Down Expand Up @@ -285,22 +311,80 @@ def _aggregate_benchmark(
metric_results[tag] = stats[agg]
return metric_results

@staticmethod
def _get_language_grouping_key(
metrics_result: "MetricsResult",
mode: LanguageAggregationMode,
) -> str | None:
"""Determine the grouping language for a dataset result.

Delegates to :func:`workrb.types.get_language_grouping_key`.

Returns ``None`` when the dataset is incompatible with the requested
mode, so that the caller can skip it during aggregation.

Parameters
----------
metrics_result : MetricsResult
The metrics result to extract a language key from.
mode : LanguageAggregationMode
The aggregation mode controlling how the language key is derived.

Returns
-------
str or None
Language code to group by, or ``None`` if the dataset is
incompatible with the mode.
"""
return get_language_grouping_key(
metrics_result.input_languages,
metrics_result.output_languages,
mode,
)

def _aggregate_per_language(
self,
tag_name: str = "mean_per_language",
aggregations: tuple = ("mean", "stderr", "ci_margin"),
aggregation_mode: LanguageAggregationMode = LanguageAggregationMode.MONOLINGUAL_ONLY,
) -> dict[ResultTagString, float]:
"""Aggregate results per language.

Collects language-specific results over all tasks, and aggregates all available results.
Results may be imbalanced if tasks support different languages.
Groups dataset results by language across all tasks and computes
aggregate statistics. The ``aggregation_mode`` parameter controls how
the grouping language is determined for each dataset.

Parameters
----------
tag_name : str
Prefix for the result tag strings.
aggregations : tuple
Statistics to compute (e.g. ``"mean"``, ``"stderr"``).
aggregation_mode : LanguageAggregationMode
How to determine the grouping language for each dataset result.
Defaults to ``MONOLINGUAL_ONLY`` (backward compatible for benchmarks
with only monolingual datasets).
Datasets incompatible with the chosen mode are skipped with a warning.
"""
# Collect metric values per task
# Collect metric values per language
raw_results = defaultdict(list)
for task_result in self.task_results.values():
for language, metrics_result in task_result.language_results.items():
for task_name, task_result in self.task_results.items():
for dataset_id, metrics_result in task_result.datasetid_results.items():
language_key = self._get_language_grouping_key(metrics_result, aggregation_mode)
if language_key is None:
logger.warning(
"Skipping dataset '%s' of task '%s' in per-language aggregation: "
"incompatible with mode '%s' "
"(input_languages=%s, output_languages=%s).",
dataset_id,
task_name,
aggregation_mode.value,
metrics_result.input_languages,
metrics_result.output_languages,
)
continue
for metric_name, metric_value in metrics_result.metrics_dict.items():
raw_results[(language, metric_name)].append(metric_value)
raw_results[(language_key, metric_name)].append(metric_value)

# Compute stats
results = {}
Expand All @@ -309,7 +393,10 @@ def _aggregate_per_language(
for agg in aggregations:
assert agg in stats, f"Aggregation {agg} not found in stats: {stats.keys()}"
tag = ResultTagString(
name=tag_name, metric_name=metric_name, aggregation=agg, grouping_name=language
name=tag_name,
metric_name=metric_name,
aggregation=agg,
grouping_name=language,
)
results[tag] = stats[agg]
return results
Expand Down Expand Up @@ -340,7 +427,7 @@ def _get_flat_dataframe(self) -> pd.DataFrame:
"""Get flat dataframe of the benchmark results with each metric value as a separate row."""
data = []
for task_name, task_result in self.task_results.items():
for language, metrics_result in task_result.language_results.items():
for dataset_id, metrics_result in task_result.datasetid_results.items():
for metric_name, metric_value in metrics_result.metrics_dict.items():
data.append(
{
Expand All @@ -349,7 +436,7 @@ def _get_flat_dataframe(self) -> pd.DataFrame:
"task_type": str(task_result.metadata.task_type),
# "task_label_type": str(task_result.metadata.label_type),
# "task_split": str(task_result.metadata.split),
"task_language": str(language),
"dataset_id": str(dataset_id),
"metric_name": str(metric_name),
"metric_value": float(metric_value),
}
Expand Down
Loading