|
14 | 14 |
|
15 | 15 | """General float serializers using dedicated tokens."""
|
16 | 16 |
|
| 17 | +import math |
17 | 18 | import re
|
18 | 19 | from typing import Sequence, Union
|
19 | 20 |
|
|
23 | 24 | from optformer.common.serialization import tokens as tokens_lib
|
24 | 25 | import ordered_set
|
25 | 26 |
|
| 27 | +TokensSerializer = tokens_lib.TokenSerializer[Sequence[Union[str, int]]] |
| 28 | + |
26 | 29 |
|
27 | 30 | @gin.configurable
|
28 | 31 | @attrs.define
|
@@ -53,11 +56,9 @@ class DigitByDigitFloatTokenSerializer(
|
53 | 56 | num_digits: int = attrs.field(default=4)
|
54 | 57 | exponent_range: int = attrs.field(default=10)
|
55 | 58 |
|
56 |
| - tokens_serializer: tokens_lib.TokenSerializer[Sequence[Union[str, int]]] = ( |
57 |
| - attrs.field( |
58 |
| - kw_only=True, |
59 |
| - factory=tokens_lib.UnitSequenceTokenSerializer, |
60 |
| - ) |
| 59 | + tokens_serializer: TokensSerializer = attrs.field( |
| 60 | + kw_only=True, |
| 61 | + factory=tokens_lib.UnitSequenceTokenSerializer, |
61 | 62 | )
|
62 | 63 |
|
63 | 64 | @property
|
@@ -127,3 +128,95 @@ def from_str(self, s: str, /) -> float:
|
127 | 128 | exp = int(''.join(tokens[-1]).lstrip('E'))
|
128 | 129 |
|
129 | 130 | return float(sign * mantissa * 10**exp)
|
| 131 | + |
| 132 | + |
@attrs.define(kw_only=True)
class IEEEFloatTokenSerializer(
    tokens_lib.CartesianProductTokenSerializer[float]
):
  """More official float serializer, minimizing the use of dedicated tokens.

  Follows IEEE-type standard.

  A float f = `s * b^e * m` can be represented as [s, e, m] from most to least
  important, where:
    s: Positive/Negative sign (+, -)
    b: Base
    e: Exponent (left-most is a sign, digits represented with base b)
    m: Mantissa (represented with base b)

  For example, 1.23456789e-222 can be represented as:

    <+><-><2><2><2><1><2><3><4>

  if b=10, num_exponent_digits=3, and num_mantissa_digits=4.

  The mantissa is truncated (not rounded) to `num_mantissa_digits` digits, so
  a `to_str` / `from_str` round trip is lossy by design.

  NOTE(review): `to_str` renders digits with `np.base_repr`, which uses letters
  for digit values >= 10, while `tokens_used` enumerates the integers
  `range(base)`. Bases above 10 therefore look inconsistent -- confirm before
  relying on them.
  """

  # Radix used for both the exponent digits and the mantissa digits.
  base: int = attrs.field(default=10)

  # Number of digit tokens for |exponent| and for the mantissa respectively
  # (the two sign tokens are not counted here).
  num_exponent_digits: int = attrs.field(default=1)
  num_mantissa_digits: int = attrs.field(default=4)

  # Converts between a sequence of raw symbols and the final token string.
  tokens_serializer: TokensSerializer = attrs.field(
      factory=tokens_lib.UnitSequenceTokenSerializer,
  )

  @property
  def num_tokens_per_obj(self) -> int:
    """Token count per float: two signs + exponent digits + mantissa digits."""
    return 2 + self.num_exponent_digits + self.num_mantissa_digits

  def tokens_used(self, index: int) -> ordered_set.OrderedSet[str]:
    """Returns the tokens that may appear at position `index`.

    Args:
      index: Position within a serialized float, in `[0, num_tokens_per_obj)`.

    Returns:
      The two sign tokens for positions 0 (float sign) and 1 (exponent sign),
      otherwise one token per digit in `range(base)`.

    Raises:
      ValueError: If `index` is outside `[0, num_tokens_per_obj)`.
    """
    if index < 0 or index >= self.num_tokens_per_obj:
      raise ValueError(f'Index {index} out of bounds.')

    if index in [0, 1]:  # beginning
      tokens = [self.tokens_serializer.to_str([s]) for s in ['+', '-']]
    else:  # middle (digit)
      tokens = [self.tokens_serializer.to_str([s]) for s in range(self.base)]
    return ordered_set.OrderedSet(tokens)

  def _scaled_mantissa(self, abs_f: float, exponent: int) -> int:
    """Returns `abs_f / base^exponent` as a truncated integer mantissa."""
    shift = self.num_mantissa_digits - 1 - exponent
    # Explicit int(): `np.base_repr` is documented for integers only.
    return int(abs_f * self.base**shift)

  def to_str(self, f: float, /) -> str:
    """Serializes `f` as [sign, exponent sign, exponent digits, mantissa].

    Args:
      f: Float to serialize. Non-finite values are not supported; the exponent
        / mantissa computation fails on them.

    Returns:
      Token string produced by `tokens_serializer` from the raw symbols.

    Raises:
      ValueError: If the exponent needs more than `num_exponent_digits` digits.
    """
    sign = '+' if f >= 0 else '-'
    abs_f = abs(f)

    # A normalized, integer-scaled mantissa lies in [lower, upper).
    lower = self.base ** (self.num_mantissa_digits - 1)
    upper = lower * self.base
    max_abs_exponent = self.base**self.num_exponent_digits - 1

    exponent = math.floor(np.log(abs_f) / np.log(self.base)) if abs_f > 0 else 0

    # Exponents that cannot fit even after the one-step correction below are
    # rejected before any scaling, so callers keep getting a ValueError.
    if abs(exponent) > max_abs_exponent + 1:
      too_large = np.base_repr(abs(exponent), base=self.base)
      raise ValueError(f'Exponent {too_large} too large.')

    scaled = self._scaled_mantissa(abs_f, exponent)
    if abs_f > 0:
      # The floating-point log ratio can be off by one at (or next to) exact
      # powers of the base, e.g. log(1000) / log(10) == 2.9999999999999996.
      # Normalizing on the scaled mantissa keeps exponent and mantissa
      # consistent instead of silently dropping or appending a digit.
      while scaled >= upper:
        exponent += 1
        scaled = self._scaled_mantissa(abs_f, exponent)
      while scaled < lower:
        exponent -= 1
        scaled = self._scaled_mantissa(abs_f, exponent)
      # Rounding in the product may still land exactly on `upper`.
      scaled = min(scaled, upper - 1)

    exponent_sign = '+' if exponent >= 0 else '-'
    abs_exponent = abs(exponent)

    e = np.base_repr(abs_exponent, base=self.base)
    if len(e) > self.num_exponent_digits:  # Overflow, raise error for now.
      # TODO: Should we round or add 'inf' token?
      raise ValueError(f'Exponent {e} too large.')
    e = e.zfill(self.num_exponent_digits)

    # Only zero needs padding here; left-padding keeps the value unchanged.
    mantissa = np.base_repr(scaled, base=self.base)
    mantissa = mantissa.zfill(self.num_mantissa_digits)

    raw_str = sign + exponent_sign + e + mantissa
    return self.tokens_serializer.to_str(list(raw_str))

  def from_str(self, s: str, /) -> float:
    """Parses a string produced by `to_str` back into a float.

    Args:
      s: Serialized float; decoded into symbols by `tokens_serializer`.

    Returns:
      `sign * base^exponent * mantissa`, where the mantissa digits are read as
      an integer and divided by `base^(num_mantissa_digits - 1)`.
    """
    tokens = self.tokens_serializer.from_str(s)

    # Anything other than '-' is treated as a positive sign.
    sign = -1 if tokens[0] == '-' else 1

    exponent_sign = -1 if tokens[1] == '-' else 1

    abs_exponent_str = ''.join(
        map(str, tokens[2 : 2 + self.num_exponent_digits])
    )
    abs_exponent = int(abs_exponent_str, base=self.base)
    exponent = exponent_sign * abs_exponent

    mantissa_str = ''.join(map(str, tokens[2 + self.num_exponent_digits :]))
    mantissa_unscaled = int(mantissa_str, base=self.base)
    mantissa = mantissa_unscaled / self.base ** (self.num_mantissa_digits - 1)

    return sign * (self.base**exponent) * mantissa
0 commit comments