|
15 | 15 | """Vocabularies for encoding/decoding floats."""
|
16 | 16 |
|
17 | 17 | import abc
|
| 18 | +import itertools |
18 | 19 | import re
|
19 | 20 |
|
20 | 21 | import attrs
|
@@ -98,7 +99,7 @@ def from_int(self, token_ids: list[int]) -> float:
|
98 | 99 |
|
99 | 100 |
|
100 | 101 | @attrs.define
|
101 |
| -class NormalizedVocab: |
| 102 | +class NormalizedVocab(FloatVocab): |
102 | 103 | """Vocab which supports only numbers within [0,1]."""
|
103 | 104 |
|
104 | 105 | base: int = attrs.field(default=2)
|
@@ -135,3 +136,102 @@ def from_int(self, token_ids: list[int]) -> float:
|
135 | 136 | x = np.asarray(token_ids)
|
136 | 137 | coeff = np.power(self.base, -1 * np.arange(1, len(x) + 1), dtype=np.float32)
|
137 | 138 | return float(np.sum(x * coeff))
|
| 139 | + |
| 140 | + |
@attrs.define
class HammingDistanceVocab(FloatVocab):
  """Based on https://ieeexplore.ieee.org/document/8437644.

  Minimizes magnitude change on the float if the tokenization is corrupted. For
  now only works on base=2.
  """

  base: int = attrs.field(default=2)
  length: int = attrs.field(default=1)

  # All 2**length binary sequences, ordered by (Hamming weight, lexicographic)
  # so that a single corrupted bit changes the decoded index as little as
  # possible. Built in __attrs_post_init__.
  all_binary_sequences = attrs.field(init=False)
  # Reverse lookup (sequence tuple -> index) so from_int decodes in O(1)
  # instead of an O(2**length) list.index scan.
  _seq_to_index = attrs.field(init=False)

  def __attrs_post_init__(self):
    if self.base != 2:
      raise ValueError('Only base=2 is supported.')

    self.all_binary_sequences = []
    for num_ones in range(self.length + 1):
      for one_positions in itertools.combinations(range(self.length), num_ones):
        binary_seq = [0] * self.length
        for pos in one_positions:
          binary_seq[pos] = 1
        self.all_binary_sequences.append(tuple(binary_seq))
    # Group by Hamming weight first, then lexicographically within a weight.
    self.all_binary_sequences.sort(key=lambda seq: (sum(seq), seq))
    self._seq_to_index = {
        seq: i for i, seq in enumerate(self.all_binary_sequences)
    }

  @property
  def size(self) -> int:
    """Number of distinct token ids (one per digit value)."""
    return self.base

  @property
  def token_length(self) -> int:
    """Number of tokens used to encode one float."""
    return self.length

  def logit_mask(self, index: int):
    """Every token id is valid at every position."""
    del index
    return np.ones(self.size, dtype=bool)

  def to_int(self, f: float) -> list[int]:
    """Encodes a float in [0, 1] as a weight-ordered binary sequence.

    Args:
      f: The float to encode; must lie in [0, 1].

    Returns:
      A list of `length` token ids in range(base).

    Raises:
      ValueError: If `f` is outside [0, 1].
    """
    if not 0 <= f <= 1:
      raise ValueError(f'f must be between 0 and 1, got {f}')

    f_int = int(f * self.base**self.length)
    if f_int == self.base**self.length:
      f_int -= 1  # Adjust for the edge case when f is exactly 1
    return list(self.all_binary_sequences[f_int])

  def from_int(self, token_ids: list[int]) -> float:
    """Decodes a token sequence back to a float in [0, 1).

    Args:
      token_ids: Exactly `length` token ids, each in range(base).

    Returns:
      The decoded float: (rank of the sequence) / base**length.

    Raises:
      ValueError: If the length or any token id is out of range.
    """
    if len(token_ids) != self.length:
      raise ValueError(f'Length {len(token_ids)} does not match {self.length}.')
    if not all(0 <= tid < self.base for tid in token_ids):
      raise ValueError(f'{token_ids} out of range(0, {self.base})')

    ind = self._seq_to_index[tuple(token_ids)]
    return float(ind) / (self.base**self.length)
| 196 | + |
| 197 | + |
@attrs.define
class RepeatingVocab(FloatVocab):
  """Performs error correction by majority voting on decoded tokens."""

  base_vocab: FloatVocab = attrs.field()
  num_repeats: int = attrs.field(default=1)

  @property
  def size(self) -> int:
    return self.base_vocab.size

  @property
  def token_length(self) -> int:
    return self.base_vocab.token_length * self.num_repeats

  def logit_mask(self, index: int):
    # Positions cycle through the underlying vocab once per repeat.
    return self.base_vocab.logit_mask(index % self.base_vocab.token_length)

  def to_int(self, f: float) -> list[int]:
    # Encode once with the underlying vocab, then repeat the whole sequence.
    return self.base_vocab.to_int(f) * self.num_repeats

  def from_int(self, token_ids: list[int]) -> float:
    if len(token_ids) != self.token_length:
      raise ValueError(
          f'Expected {self.token_length} tokens, got {len(token_ids)}'
      )

    # One row per repeat of the underlying token sequence.
    repeats = np.asarray(token_ids).reshape(
        self.num_repeats, self.base_vocab.token_length
    )

    # Majority vote per token position; bincount().argmax() breaks ties in
    # favor of the smaller token id, matching a column-wise mode.
    voted = [int(np.bincount(column).argmax()) for column in repeats.T]

    # Decode the corrected sequence with the underlying vocabulary.
    return self.base_vocab.from_int(voted)
0 commit comments