From efb44a6c6aecb2b5afdd2b152416d854a393b8b8 Mon Sep 17 00:00:00 2001
From: Doug Hellmann
Date: Wed, 24 Jul 2024 11:52:58 -0400
Subject: [PATCH] dynamically generate version from tags

Use setuptools-scm to get a version from the tag when released instead
of looking for the contents of a file. Generate a version.py file to
replace the static value in __init__.py.
---
 .gitignore                |   1 +
 pyproject.toml            |   7 +-
 sacrebleu/__init__.py     |  63 ++++++++++-----
 sacrebleu/metrics/base.py | 158 +++++++++++++++++++++-----------------
 4 files changed, 138 insertions(+), 91 deletions(-)

diff --git a/.gitignore b/.gitignore
index e2d4f533..a71c6217 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ sacrebleu.egg-info
 *~
 .DS_Store
 .idea/
+sacrebleu/version.py
diff --git a/pyproject.toml b/pyproject.toml
index 69cea21c..e8d06958 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,10 @@
 [build-system]
-requires = ["setuptools>=64"]
+requires = ["setuptools>=64", "setuptools_scm>=8"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "sacrebleu"
-version = "2.4.1"
+dynamic = ["version"]
 authors = [{ name = "Matt Post", email = "post@cs.jhu.edu" }]
 maintainers = [{ name = "Matt Post", email = "post@cs.jhu.edu" }]
 description = "Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores"
@@ -72,3 +72,6 @@ Repository = "https://github.com/mjpost/sacrebleu"
 
 [tool.setuptools.package-data]
 sacrebleu = ["py.typed"]
+
+[tool.setuptools_scm]
+version_file = "sacrebleu/version.py"
diff --git a/sacrebleu/__init__.py b/sacrebleu/__init__.py
index c75839a1..111f284d 100644
--- a/sacrebleu/__init__.py
+++ b/sacrebleu/__init__.py
@@ -14,30 +14,53 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '2.4.1'
-__description__ = 'Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores'
+__description__ = "Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores"
 
-from .utils import smart_open, SACREBLEU_DIR, download_test_set
-from .utils import get_source_file, get_reference_files
-from .utils import get_available_testsets, get_langpairs_for_testset
-from .metrics.helpers import extract_word_ngrams, extract_char_ngrams
+# Backward compatibility functions for old style API access (<= 1.4.10)
+from .compat import (
+    corpus_bleu,
+    corpus_chrf,
+    corpus_ter,
+    raw_corpus_bleu,
+    sentence_bleu,
+    sentence_chrf,
+    sentence_ter,
+)
 from .dataset import DATASETS
 from .metrics import BLEU, CHRF, TER
-
-# Backward compatibility functions for old style API access (<= 1.4.10)
-from .compat import corpus_bleu, raw_corpus_bleu, sentence_bleu
-from .compat import corpus_chrf, sentence_chrf
-from .compat import corpus_ter, sentence_ter
+from .metrics.helpers import extract_char_ngrams, extract_word_ngrams
+from .utils import (
+    SACREBLEU_DIR,
+    download_test_set,
+    get_available_testsets,
+    get_langpairs_for_testset,
+    get_reference_files,
+    get_source_file,
+    smart_open,
+)
+from .version import __version__
 
 __all__ = [
-    'smart_open', 'SACREBLEU_DIR', 'download_test_set',
-    'get_source_file', 'get_reference_files',
-    'get_available_testsets', 'get_langpairs_for_testset',
-    'extract_word_ngrams', 'extract_char_ngrams',
-    'DATASETS',
-    'BLEU', 'CHRF', 'TER',
-    'corpus_bleu', 'raw_corpus_bleu', 'sentence_bleu',
-    'corpus_chrf', 'sentence_chrf',
-    'corpus_ter', 'sentence_ter'
+    "smart_open",
+    "SACREBLEU_DIR",
+    "download_test_set",
+    "get_source_file",
+    "get_reference_files",
+    "get_available_testsets",
+    "get_langpairs_for_testset",
+    "extract_word_ngrams",
+    "extract_char_ngrams",
+    "DATASETS",
+    "BLEU",
+    "CHRF",
+    "TER",
+    "corpus_bleu",
+    "raw_corpus_bleu",
+    "sentence_bleu",
+    "corpus_chrf",
+    "sentence_chrf",
+    "corpus_ter",
+    "sentence_ter",
+    "__version__",
 ]
diff --git a/sacrebleu/metrics/base.py b/sacrebleu/metrics/base.py
index 93fb1081..b0141247 100644
--- a/sacrebleu/metrics/base.py
+++ b/sacrebleu/metrics/base.py
@@ -8,12 +8,12 @@
 import json
 import logging
 import statistics
-from typing import List, Sequence, Any, Optional, Dict
 from abc import ABCMeta, abstractmethod
+from typing import Any, Dict, List, Optional, Sequence
 
-from .. import __version__
+from ..version import __version__
 
-sacrelogger = logging.getLogger('sacrebleu')
+sacrelogger = logging.getLogger("sacrebleu")
 
 
 class Score:
@@ -22,6 +22,7 @@ class Score:
     :param name: The name of the underlying metric.
     :param score: A floating point number for the final metric.
     """
+
     def __init__(self, name: str, score: float):
         """`Score` initializer."""
         self.name = name
@@ -32,10 +33,15 @@ def __init__(self, name: str, score: float):
         self._ci = -1.0
 
         # More info can be added right after the score
-        self._verbose = ''
-
-    def format(self, width: int = 2, score_only: bool = False,
-               signature: str = '', is_json: bool = False) -> str:
+        self._verbose = ""
+
+    def format(
+        self,
+        width: int = 2,
+        score_only: bool = False,
+        signature: str = "",
+        is_json: bool = False,
+    ) -> str:
         """Returns a pretty representation of the score.
         :param width: Floating point decimal precision width.
         :param score_only: If `True`, and the format is not `json`,
@@ -46,43 +52,43 @@ def format(self, width: int = 2, score_only: bool = False,
         :return: A plain or JSON-formatted string representation.
         """
         d = {
-            'name': self.name,
-            'score': float(f'{self.score:.{width}f}'),
-            'signature': signature,
+            "name": self.name,
+            "score": float(f"{self.score:.{width}f}"),
+            "signature": signature,
         }
 
-        sc = f'{self.score:.{width}f}'
+        sc = f"{self.score:.{width}f}"
 
         if self._mean > 0:
-            confidence_mean = f'{self._mean:.{width}f}'
-            confidence_var = f'{self._ci:.{width}f}'
-            confidence_str = f'μ = {confidence_mean} ± {confidence_var}'
+            confidence_mean = f"{self._mean:.{width}f}"
+            confidence_var = f"{self._ci:.{width}f}"
+            confidence_str = f"μ = {confidence_mean} ± {confidence_var}"
 
-            sc += f' ({confidence_str})'
+            sc += f" ({confidence_str})"
 
             if is_json:
-                d['confidence_mean'] = float(confidence_mean)
-                d['confidence_var'] = float(confidence_var)
-                d['confidence'] = confidence_str
+                d["confidence_mean"] = float(confidence_mean)
+                d["confidence_var"] = float(confidence_var)
+                d["confidence"] = confidence_str
 
         # Construct full score line
         full_score = f"{self.name}|{signature}" if signature else self.name
         full_score = f"{full_score} = {sc}"
         if self._verbose:
-            full_score += f' {self._verbose}'
-            d['verbose_score'] = self._verbose
+            full_score += f" {self._verbose}"
+            d["verbose_score"] = self._verbose
 
         if score_only:
             return sc
 
         if is_json:
-            for param in signature.split('|'):
-                key, value = param.split(':')
+            for param in signature.split("|"):
+                key, value = param.split(":")
                 d[key] = value
             return json.dumps(d, indent=1, ensure_ascii=False)
 
         return full_score
 
-    def estimate_ci(self, scores: List['Score']):
+    def estimate_ci(self, scores: List["Score"]):
         """Takes a list of scores and stores mean, stdev and 95%
         confidence interval around the mean.
@@ -110,42 +116,44 @@ class Signature:
 
     :param args: key-value dictionary passed from the actual metric instance.
     """
+
     def __init__(self, args: dict):
         """`Signature` initializer."""
         # Global items that are shared across all metrics
         self._abbr = {
-            'version': 'v',
-            'nrefs': '#',
-            'test': 't',
-            'lang': 'l',
-            'subset': 'S',
-            'origlang': 'o',
-            'bs': 'bs',  # Bootstrap resampling trials
-            'ar': 'ar',  # Approximate randomization trials
-            'seed': 'rs',  # RNG's seed
+            "version": "v",
+            "nrefs": "#",
+            "test": "t",
+            "lang": "l",
+            "subset": "S",
+            "origlang": "o",
+            "bs": "bs",  # Bootstrap resampling trials
+            "ar": "ar",  # Approximate randomization trials
+            "seed": "rs",  # RNG's seed
         }
 
-        if 'num_refs' not in args:
+        if "num_refs" not in args:
             raise ValueError(
-                'Number of references unknown, please evaluate the metric first.')
+                "Number of references unknown, please evaluate the metric first."
+            )
 
-        num_refs = args['num_refs']
+        num_refs = args["num_refs"]
 
         if num_refs == -1:  # Detect variable number of refs
-            num_refs = 'var'
+            num_refs = "var"
 
         # Global items that are shared across all metrics
         # None's will be ignored
         self.info = {
-            'version': __version__,
-            'nrefs': num_refs,
-            'bs': args.get('n_bootstrap', None),
-            'ar': None,
-            'seed': args.get('seed', None),
-            'test': args.get('test_set', None),
-            'lang': args.get('langpair', None),
-            'origlang': args.get('origlang', None),
-            'subset': args.get('subset', None),
+            "version": __version__,
+            "nrefs": num_refs,
+            "bs": args.get("n_bootstrap", None),
+            "ar": None,
+            "seed": args.get("seed", None),
+            "test": args.get("test_set", None),
+            "lang": args.get("langpair", None),
+            "origlang": args.get("origlang", None),
+            "subset": args.get("subset", None),
         }
 
     def format(self, short: bool = False) -> str:
@@ -157,17 +165,17 @@ def format(self, short: bool = False) -> str:
         pairs = []
         keys = list(self.info.keys())
         # keep version always at end
-        keys.remove('version')
-        for name in keys + ['version']:
+        keys.remove("version")
+        for name in keys + ["version"]:
             value = self.info[name]
             if value is not None:
                 if isinstance(value, bool):
                     # Replace True/False with yes/no
-                    value = 'yes' if value else 'no'
+                    value = "yes" if value else "no"
                 final_name = self._abbr[name] if short else name
-                pairs.append(f'{final_name}:{value}')
+                pairs.append(f"{final_name}:{value}")
 
-        return '|'.join(pairs)
+        return "|".join(pairs)
 
     def update(self, key: str, value: Any):
         """Add a new item or update an existing one.
@@ -217,17 +225,18 @@ def _check_sentence_score_args(self, hyp: str, refs: Sequence[str]):
         err_msg = None
 
         if not isinstance(hyp, str):
-            err_msg = 'The argument `hyp` should be a string.'
+            err_msg = "The argument `hyp` should be a string."
         elif isinstance(refs, str) or not isinstance(refs, Sequence):
-            err_msg = 'The argument `refs` should be a sequence of strings.'
+            err_msg = "The argument `refs` should be a sequence of strings."
         elif not isinstance(refs[0], str) and refs[0] is not None:
-            err_msg = 'Each element of `refs` should be a string.'
+            err_msg = "Each element of `refs` should be a string."
 
         if err_msg:
-            raise TypeError(f'{prefix}: {err_msg}')
+            raise TypeError(f"{prefix}: {err_msg}")
 
-    def _check_corpus_score_args(self, hyps: Sequence[str],
-                                 refs: Optional[Sequence[Sequence[str]]]):
+    def _check_corpus_score_args(
+        self, hyps: Sequence[str], refs: Optional[Sequence[Sequence[str]]]
+    ):
         """Performs sanity checks on `corpus_score` method's arguments.
 
         :param hypses: A sequence of hypothesis strings.
@@ -242,7 +251,7 @@ def _check_corpus_score_args(self, hyps: Sequence[str],
         if not isinstance(hyps, Sequence):
             err_msg = "`hyps` should be a sequence of strings."
         elif not isinstance(hyps[0], str):
-            err_msg = 'Each element of `hyps` should be a string.'
+            err_msg = "Each element of `hyps` should be a string."
         elif any(line is None for line in hyps):
             err_msg = "Undefined line in hypotheses stream!"
 
@@ -255,7 +264,7 @@ def _check_corpus_score_args(self, hyps: Sequence[str],
                 err_msg = "`refs` should be a sequence of sequence of strings."
 
         if err_msg:
-            raise TypeError(f'{prefix}: {err_msg}')
+            raise TypeError(f"{prefix}: {err_msg}")
 
     @abstractmethod
     def _aggregate_and_compute(self, stats: List[List[Any]]) -> Any:
@@ -296,7 +305,9 @@ def _extract_reference_info(self, refs: Sequence[str]) -> Dict[str, Any]:
         pass
 
     @abstractmethod
-    def _compute_segment_statistics(self, hypothesis: str, ref_kwargs: Dict) -> List[Any]:
+    def _compute_segment_statistics(
+        self, hypothesis: str, ref_kwargs: Dict
+    ) -> List[Any]:
         """Given a (pre-processed) hypothesis sentence and already computed
         reference info, returns the best match statistics across the
         references. The return type is usually a List of ints or floats.
@@ -345,8 +356,9 @@ def _cache_references(self, references: Sequence[Sequence[str]]) -> List[Any]:
 
         return ref_cache
 
-    def _extract_corpus_statistics(self, hypotheses: Sequence[str],
-                                    references: Optional[Sequence[Sequence[str]]]) -> Any:
+    def _extract_corpus_statistics(
+        self, hypotheses: Sequence[str], references: Optional[Sequence[Sequence[str]]]
+    ) -> Any:
         """Reads the corpus and returns sentence-level match statistics for
         faster re-computations esp. during statistical tests.
 
@@ -363,14 +375,14 @@ def _extract_corpus_statistics(self, hypotheses: Sequence[str],
         elif self._ref_cache:
             ref_cache = self._ref_cache
         else:
-            raise RuntimeError('No references provided and the cache is empty.')
+            raise RuntimeError("No references provided and the cache is empty.")
 
         stats = []
         tok_count = 0
 
         for hyp, ref_kwargs in zip(hypotheses, ref_cache):
             # Check for already-tokenized input problem (only for BLEU)
-            if not self._force and hyp.endswith(' .'):
+            if not self._force and hyp.endswith(" ."):
                 tok_count += 1
 
             hyp = self._preprocess_segment(hyp)
@@ -380,8 +392,12 @@ def _extract_corpus_statistics(self, hypotheses: Sequence[str],
 
         if tok_count >= 100:
             sacrelogger.warning("That's 100 lines that end in a tokenized period ('.')")
-            sacrelogger.warning("It looks like you forgot to detokenize your test data, which may hurt your score.")
-            sacrelogger.warning("If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.")
+            sacrelogger.warning(
+                "It looks like you forgot to detokenize your test data, which may hurt your score."
+            )
+            sacrelogger.warning(
+                "If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter."
+            )
 
         return stats
 
@@ -395,12 +411,16 @@ def sentence_score(self, hypothesis: str, references: Sequence[str]) -> Any:
         self._check_sentence_score_args(hypothesis, references)
 
         stats = self._extract_corpus_statistics(
-            [hypothesis], [[refs] for refs in references])
+            [hypothesis], [[refs] for refs in references]
+        )
        return self._aggregate_and_compute(stats)
 
-    def corpus_score(self, hypotheses: Sequence[str],
-                     references: Optional[Sequence[Sequence[str]]],
-                     n_bootstrap: int = 1) -> Any:
+    def corpus_score(
+        self,
+        hypotheses: Sequence[str],
+        references: Optional[Sequence[Sequence[str]]],
+        n_bootstrap: int = 1,
+    ) -> Any:
         """Compute the metric for a corpus against a single (or multiple) reference(s).
 
         :param hypotheses: A sequence of hypothesis strings.
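
Reviewer note, not part of the patch above: with the new `[tool.setuptools_scm]` table pointing `version_file` at `sacrebleu/version.py`, setuptools-scm derives the package version from the most recent git tag at build time and writes it into that git-ignored module, which is what the new `from .version import __version__` imports pick up. As a rough sketch (the exact contents vary by setuptools-scm release, and the `2.4.2` value is purely illustrative, not taken from the patch), the generated file looks something like:

    # Illustrative sketch of the generated sacrebleu/version.py -- written by
    # setuptools-scm at build time and kept out of version control (.gitignore).
    __version__ = version = "2.4.2"
    __version_tuple__ = version_tuple = (2, 4, 2)

During development, running `python -m setuptools_scm` from the repository root prints the version that would be computed from the current checkout, which is a quick way to sanity-check the tag-derived value.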