From efb44a6c6aecb2b5afdd2b152416d854a393b8b8 Mon Sep 17 00:00:00 2001
From: Doug Hellmann
Date: Wed, 24 Jul 2024 11:52:58 -0400
Subject: [PATCH] dynamically generate version from tags

Use setuptools-scm to get a version from the tag when released instead
of looking for the contents of a file. Generate a version.py file to
replace the static value in __init__.py.
---
 .gitignore                |   1 +
 pyproject.toml            |   7 +-
 sacrebleu/__init__.py     |  63 ++++++++++-----
 sacrebleu/metrics/base.py | 158 +++++++++++++++++++++-----------------
 4 files changed, 138 insertions(+), 91 deletions(-)

diff --git a/.gitignore b/.gitignore
index e2d4f533..a71c6217 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ sacrebleu.egg-info
 *~
 .DS_Store
 .idea/
+sacrebleu/version.py
diff --git a/pyproject.toml b/pyproject.toml
index 69cea21c..e8d06958 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,10 @@
 [build-system]
-requires = ["setuptools>=64"]
+requires = ["setuptools>=64", "setuptools_scm>=8"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "sacrebleu"
-version = "2.4.1"
+dynamic = ["version"]
 authors = [{ name = "Matt Post", email = "post@cs.jhu.edu" }]
 maintainers = [{ name = "Matt Post", email = "post@cs.jhu.edu" }]
 description = "Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores"
@@ -72,3 +72,6 @@ Repository = "https://github.com/mjpost/sacrebleu"
 
 [tool.setuptools.package-data]
 sacrebleu = ["py.typed"]
+
+[tool.setuptools_scm]
+version_file = "sacrebleu/version.py"
diff --git a/sacrebleu/__init__.py b/sacrebleu/__init__.py
index c75839a1..111f284d 100644
--- a/sacrebleu/__init__.py
+++ b/sacrebleu/__init__.py
@@ -14,30 +14,53 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '2.4.1'
-__description__ = 'Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores'
+__description__ = "Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores"
 
-from .utils import smart_open, SACREBLEU_DIR, download_test_set
-from .utils import get_source_file, get_reference_files
-from .utils import get_available_testsets, get_langpairs_for_testset
-from .metrics.helpers import extract_word_ngrams, extract_char_ngrams
+# Backward compatibility functions for old style API access (<= 1.4.10)
+from .compat import (
+    corpus_bleu,
+    corpus_chrf,
+    corpus_ter,
+    raw_corpus_bleu,
+    sentence_bleu,
+    sentence_chrf,
+    sentence_ter,
+)
 from .dataset import DATASETS
 from .metrics import BLEU, CHRF, TER
-
-# Backward compatibility functions for old style API access (<= 1.4.10)
-from .compat import corpus_bleu, raw_corpus_bleu, sentence_bleu
-from .compat import corpus_chrf, sentence_chrf
-from .compat import corpus_ter, sentence_ter
+from .metrics.helpers import extract_char_ngrams, extract_word_ngrams
+from .utils import (
+    SACREBLEU_DIR,
+    download_test_set,
+    get_available_testsets,
+    get_langpairs_for_testset,
+    get_reference_files,
+    get_source_file,
+    smart_open,
+)
+from .version import __version__
 
 __all__ = [
-    'smart_open', 'SACREBLEU_DIR', 'download_test_set',
-    'get_source_file', 'get_reference_files',
-    'get_available_testsets', 'get_langpairs_for_testset',
-    'extract_word_ngrams', 'extract_char_ngrams',
-    'DATASETS',
-    'BLEU', 'CHRF', 'TER',
-    'corpus_bleu', 'raw_corpus_bleu', 'sentence_bleu',
-    'corpus_chrf', 'sentence_chrf',
-    'corpus_ter', 'sentence_ter'
+    "smart_open",
+    "SACREBLEU_DIR",
+    "download_test_set",
+    "get_source_file",
+    "get_reference_files",
+    "get_available_testsets",
+    "get_langpairs_for_testset",
+    "extract_word_ngrams",
+    "extract_char_ngrams",
+    "DATASETS",
+    "BLEU",
+    "CHRF",
+    "TER",
+    "corpus_bleu",
+    "raw_corpus_bleu",
+    "sentence_bleu",
+    "corpus_chrf",
+    "sentence_chrf",
+    "corpus_ter",
+    "sentence_ter",
+    "__version__",
 ]
diff --git a/sacrebleu/metrics/base.py b/sacrebleu/metrics/base.py
index 93fb1081..b0141247 100644
--- a/sacrebleu/metrics/base.py
+++ b/sacrebleu/metrics/base.py
@@ -8,12 +8,12 @@
 import json
 import logging
 import statistics
-from typing import List, Sequence, Any, Optional, Dict
 from abc import ABCMeta, abstractmethod
+from typing import Any, Dict, List, Optional, Sequence
 
-from .. import __version__
+from ..version import __version__
 
-sacrelogger = logging.getLogger('sacrebleu')
+sacrelogger = logging.getLogger("sacrebleu")
 
 
 class Score:
@@ -22,6 +22,7 @@ class Score:
     :param name: The name of the underlying metric.
     :param score: A floating point number for the final metric.
     """
+
     def __init__(self, name: str, score: float):
         """`Score` initializer."""
         self.name = name
@@ -32,10 +33,15 @@ def __init__(self, name: str, score: float):
         self._ci = -1.0
 
         # More info can be added right after the score
-        self._verbose = ''
-
-    def format(self, width: int = 2, score_only: bool = False,
-               signature: str = '', is_json: bool = False) -> str:
+        self._verbose = ""
+
+    def format(
+        self,
+        width: int = 2,
+        score_only: bool = False,
+        signature: str = "",
+        is_json: bool = False,
+    ) -> str:
         """Returns a pretty representation of the score.
         :param width: Floating point decimal precision width.
         :param score_only: If `True`, and the format is not `json`,
@@ -46,43 +52,43 @@ def format(self, width: int = 2, score_only: bool = False,
         :return: A plain or JSON-formatted string representation.
         """
         d = {
-            'name': self.name,
-            'score': float(f'{self.score:.{width}f}'),
-            'signature': signature,
+            "name": self.name,
+            "score": float(f"{self.score:.{width}f}"),
+            "signature": signature,
         }
 
-        sc = f'{self.score:.{width}f}'
+        sc = f"{self.score:.{width}f}"
 
         if self._mean > 0:
-            confidence_mean = f'{self._mean:.{width}f}'
-            confidence_var = f'{self._ci:.{width}f}'
-            confidence_str = f'μ = {confidence_mean} ± {confidence_var}'
+            confidence_mean = f"{self._mean:.{width}f}"
+            confidence_var = f"{self._ci:.{width}f}"
+            confidence_str = f"μ = {confidence_mean} ± {confidence_var}"
 
-            sc += f' ({confidence_str})'
+            sc += f" ({confidence_str})"
 
             if is_json:
-                d['confidence_mean'] = float(confidence_mean)
-                d['confidence_var'] = float(confidence_var)
-                d['confidence'] = confidence_str
+                d["confidence_mean"] = float(confidence_mean)
+                d["confidence_var"] = float(confidence_var)
+                d["confidence"] = confidence_str
 
         # Construct full score line
         full_score = f"{self.name}|{signature}" if signature else self.name
         full_score = f"{full_score} = {sc}"
         if self._verbose:
-            full_score += f' {self._verbose}'
-            d['verbose_score'] = self._verbose
+            full_score += f" {self._verbose}"
+            d["verbose_score"] = self._verbose
 
         if score_only:
             return sc
 
         if is_json:
-            for param in signature.split('|'):
-                key, value = param.split(':')
+            for param in signature.split("|"):
+                key, value = param.split(":")
                 d[key] = value
             return json.dumps(d, indent=1, ensure_ascii=False)
 
         return full_score
 
-    def estimate_ci(self, scores: List['Score']):
+    def estimate_ci(self, scores: List["Score"]):
         """Takes a list of scores and stores mean, stdev and 95%
         confidence interval around the mean.
@@ -110,42 +116,44 @@ class Signature:
 
     :param args: key-value dictionary passed from the actual metric instance.
     """
+
     def __init__(self, args: dict):
         """`Signature` initializer."""
         # Global items that are shared across all metrics
         self._abbr = {
-            'version': 'v',
-            'nrefs': '#',
-            'test': 't',
-            'lang': 'l',
-            'subset': 'S',
-            'origlang': 'o',
-            'bs': 'bs',  # Bootstrap resampling trials
-            'ar': 'ar',  # Approximate randomization trials
-            'seed': 'rs',  # RNG's seed
+            "version": "v",
+            "nrefs": "#",
+            "test": "t",
+            "lang": "l",
+            "subset": "S",
+            "origlang": "o",
+            "bs": "bs",  # Bootstrap resampling trials
+            "ar": "ar",  # Approximate randomization trials
+            "seed": "rs",  # RNG's seed
         }
 
-        if 'num_refs' not in args:
+        if "num_refs" not in args:
             raise ValueError(
-                'Number of references unknown, please evaluate the metric first.')
+                "Number of references unknown, please evaluate the metric first."
+            )
 
-        num_refs = args['num_refs']
+        num_refs = args["num_refs"]
 
         if num_refs == -1:  # Detect variable number of refs
-            num_refs = 'var'
+            num_refs = "var"
 
         # Global items that are shared across all metrics
         # None's will be ignored
         self.info = {
-            'version': __version__,
-            'nrefs': num_refs,
-            'bs': args.get('n_bootstrap', None),
-            'ar': None,
-            'seed': args.get('seed', None),
-            'test': args.get('test_set', None),
-            'lang': args.get('langpair', None),
-            'origlang': args.get('origlang', None),
-            'subset': args.get('subset', None),
+            "version": __version__,
+            "nrefs": num_refs,
+            "bs": args.get("n_bootstrap", None),
+            "ar": None,
+            "seed": args.get("seed", None),
+            "test": args.get("test_set", None),
+            "lang": args.get("langpair", None),
+            "origlang": args.get("origlang", None),
+            "subset": args.get("subset", None),
         }
 
     def format(self, short: bool = False) -> str:
@@ -157,17 +165,17 @@ def format(self, short: bool = False) -> str:
         pairs = []
         keys = list(self.info.keys())
         # keep version always at end
-        keys.remove('version')
-        for name in keys + ['version']:
+        keys.remove("version")
+        for name in keys + ["version"]:
             value = self.info[name]
             if value is not None:
                 if isinstance(value, bool):
                     # Replace True/False with yes/no
-                    value = 'yes' if value else 'no'
+                    value = "yes" if value else "no"
                 final_name = self._abbr[name] if short else name
-                pairs.append(f'{final_name}:{value}')
+                pairs.append(f"{final_name}:{value}")
 
-        return '|'.join(pairs)
+        return "|".join(pairs)
 
     def update(self, key: str, value: Any):
         """Add a new item or update an existing one.
@@ -217,17 +225,18 @@ def _check_sentence_score_args(self, hyp: str, refs: Sequence[str]):
         err_msg = None
 
         if not isinstance(hyp, str):
-            err_msg = 'The argument `hyp` should be a string.'
+            err_msg = "The argument `hyp` should be a string."
         elif isinstance(refs, str) or not isinstance(refs, Sequence):
-            err_msg = 'The argument `refs` should be a sequence of strings.'
+            err_msg = "The argument `refs` should be a sequence of strings."
         elif not isinstance(refs[0], str) and refs[0] is not None:
-            err_msg = 'Each element of `refs` should be a string.'
+            err_msg = "Each element of `refs` should be a string."
 
         if err_msg:
-            raise TypeError(f'{prefix}: {err_msg}')
+            raise TypeError(f"{prefix}: {err_msg}")
 
-    def _check_corpus_score_args(self, hyps: Sequence[str],
-                                 refs: Optional[Sequence[Sequence[str]]]):
+    def _check_corpus_score_args(
+        self, hyps: Sequence[str], refs: Optional[Sequence[Sequence[str]]]
+    ):
         """Performs sanity checks on `corpus_score` method's arguments.
 
         :param hypses: A sequence of hypothesis strings.
@@ -242,7 +251,7 @@ def _check_corpus_score_args(self, hyps: Sequence[str],
         if not isinstance(hyps, Sequence):
             err_msg = "`hyps` should be a sequence of strings."
         elif not isinstance(hyps[0], str):
-            err_msg = 'Each element of `hyps` should be a string.'
+            err_msg = "Each element of `hyps` should be a string."
         elif any(line is None for line in hyps):
             err_msg = "Undefined line in hypotheses stream!"
 
@@ -255,7 +264,7 @@ def _check_corpus_score_args(self, hyps: Sequence[str],
                 err_msg = "`refs` should be a sequence of sequence of strings."
 
         if err_msg:
-            raise TypeError(f'{prefix}: {err_msg}')
+            raise TypeError(f"{prefix}: {err_msg}")
 
     @abstractmethod
     def _aggregate_and_compute(self, stats: List[List[Any]]) -> Any:
@@ -296,7 +305,9 @@ def _extract_reference_info(self, refs: Sequence[str]) -> Dict[str, Any]:
         pass
 
     @abstractmethod
-    def _compute_segment_statistics(self, hypothesis: str, ref_kwargs: Dict) -> List[Any]:
+    def _compute_segment_statistics(
+        self, hypothesis: str, ref_kwargs: Dict
+    ) -> List[Any]:
         """Given a (pre-processed) hypothesis sentence and already computed
         reference info, returns the best match statistics across the
         references. The return type is usually a List of ints or floats.
@@ -345,8 +356,9 @@ def _cache_references(self, references: Sequence[Sequence[str]]) -> List[Any]:
 
         return ref_cache
 
-    def _extract_corpus_statistics(self, hypotheses: Sequence[str],
-                                    references: Optional[Sequence[Sequence[str]]]) -> Any:
+    def _extract_corpus_statistics(
+        self, hypotheses: Sequence[str], references: Optional[Sequence[Sequence[str]]]
+    ) -> Any:
         """Reads the corpus and returns sentence-level match statistics for
         faster re-computations esp. during statistical tests.
 
@@ -363,14 +375,14 @@ def _extract_corpus_statistics(self, hypotheses: Sequence[str],
         elif self._ref_cache:
             ref_cache = self._ref_cache
         else:
-            raise RuntimeError('No references provided and the cache is empty.')
+            raise RuntimeError("No references provided and the cache is empty.")
 
         stats = []
         tok_count = 0
 
         for hyp, ref_kwargs in zip(hypotheses, ref_cache):
             # Check for already-tokenized input problem (only for BLEU)
-            if not self._force and hyp.endswith(' .'):
+            if not self._force and hyp.endswith(" ."):
                 tok_count += 1
 
             hyp = self._preprocess_segment(hyp)
@@ -380,8 +392,12 @@ def _extract_corpus_statistics(self, hypotheses: Sequence[str],
 
         if tok_count >= 100:
             sacrelogger.warning("That's 100 lines that end in a tokenized period ('.')")
-            sacrelogger.warning("It looks like you forgot to detokenize your test data, which may hurt your score.")
-            sacrelogger.warning("If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.")
+            sacrelogger.warning(
+                "It looks like you forgot to detokenize your test data, which may hurt your score."
+            )
+            sacrelogger.warning(
+                "If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter."
+            )
 
         return stats
 
@@ -395,12 +411,16 @@ def sentence_score(self, hypothesis: str, references: Sequence[str]) -> Any:
         self._check_sentence_score_args(hypothesis, references)
 
         stats = self._extract_corpus_statistics(
-            [hypothesis], [[refs] for refs in references])
+            [hypothesis], [[refs] for refs in references]
+        )
        return self._aggregate_and_compute(stats)
 
-    def corpus_score(self, hypotheses: Sequence[str],
-                     references: Optional[Sequence[Sequence[str]]],
-                     n_bootstrap: int = 1) -> Any:
+    def corpus_score(
+        self,
+        hypotheses: Sequence[str],
+        references: Optional[Sequence[Sequence[str]]],
+        n_bootstrap: int = 1,
+    ) -> Any:
         """Compute the metric for a corpus against a single (or multiple) reference(s).
 
         :param hypotheses: A sequence of hypothesis strings.
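
Reviewer note, not part of the patch above: with the new `[tool.setuptools_scm]` table pointing `version_file` at `sacrebleu/version.py`, setuptools-scm derives the package version from the most recent git tag at build time and writes it into that git-ignored module, which is what the new `from .version import __version__` imports pick up. As a rough sketch (the exact contents vary by setuptools-scm release, and the `2.4.2` value is purely illustrative, not taken from the patch), the generated file looks something like:

    # Illustrative sketch of the generated sacrebleu/version.py -- written by
    # setuptools-scm at build time and kept out of version control (.gitignore).
    __version__ = version = "2.4.2"
    __version_tuple__ = version_tuple = (2, 4, 2)

During development, running `python -m setuptools_scm` from the repository root prints the version that would be computed from the current checkout, which is a quick way to sanity-check the tag-derived value.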