Skip to content

Commit 0d51c30

Browse files
authored
Merge pull request #6180 from tfausak/gh-2228-numeric-underscores
Support underscores in numeric literals
2 parents a7d22ae + e5c5b2e commit 0d51c30

File tree

7 files changed

+253
-21
lines changed

7 files changed

+253
-21
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
fcee0da446c9e2aac1bd4e0bd4d64d7573ad529fb1f98583fb9393ab80700d15 5c585a7c2f666e019f492bfd2cc4c868fee1beb726b41ddf857d7bbc1778f8d8 pass
1+
af36334ae265cc41a9308b6817c6b59c741687cd9ccef3e985a80e993f0a9004 5c585a7c2f666e019f492bfd2cc4c868fee1beb726b41ddf857d7bbc1778f8d8 pass

.github/workflows/proofs/tests.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
76183b556c0b69d0d0b49d2b5d53c7fa3a5fb5460bdddd8331517cc944070f6e bb475eb1950a60d5a3b214802e10b997d5b6d7fa47df5a1ed69c56b37f94fd85 pass
1+
7a07a257b3e27ddb1c591e833e1baa04bd190fe341b03ade982b086d6dd3799c bb475eb1950a60d5a3b214802e10b997d5b6d7fa47df5a1ed69c56b37f94fd85 pass
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
4ff654e11c212dd3b65a6d4d181ac0f7cac0eab5691ed87edad3c9c00d3e4b79 d87764682d1e2b3d3a479e600c6acddb5dbd9795c7b64747a67f6807d438dcc2 pass
1+
ce1677f3d2eddca71c46d6e32b754d12491777f69f83d6730ac7e332c21cc299 d87764682d1e2b3d3a479e600c6acddb5dbd9795c7b64747a67f6807d438dcc2 pass

CONTRIBUTORS.markdown

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,4 @@ The format for this list: name, GitHub handle
9898
* Nic Luciano (@kn0ll)
9999
* Maurice Scheffmacher (@MauScheff)
100100
* ChanningWalton (@channingwalton)
101+
* Taylor Fausak (@tfausak)
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# Underscore separators in numeric literals
2+
3+
Unison supports underscores as visual separators in numeric literals.
4+
Underscores may appear only between two digits, in any numeric base, and are
5+
stripped before evaluation.
6+
7+
``` ucm :hide
8+
> builtins.merge
9+
```
10+
11+
## Valid literals
12+
13+
Decimal integers:
14+
15+
``` unison
16+
> 1_000
17+
> 1_000_000
18+
> +1_000
19+
> -1_000
20+
```
21+
22+
``` ucm :added-by-ucm
23+
Loading changes detected in scratch.u.
24+
25+
No changes found.
26+
27+
1 | > 1_000
28+
29+
1000
30+
31+
2 | > 1_000_000
32+
33+
1000000
34+
35+
3 | > +1_000
36+
37+
+1000
38+
39+
4 | > -1_000
40+
41+
-1000
42+
```
43+
44+
Floating-point and scientific notation:
45+
46+
``` unison
47+
> 1_000.5
48+
> 1_000.000_001
49+
> 1_000e1_0
50+
> 1_000.5e1_0
51+
```
52+
53+
``` ucm :added-by-ucm
54+
Loading changes detected in scratch.u.
55+
56+
No changes found.
57+
58+
1 | > 1_000.5
59+
60+
1000.5
61+
62+
2 | > 1_000.000_001
63+
64+
1000.000001
65+
66+
3 | > 1_000e1_0
67+
68+
1.0e13
69+
70+
4 | > 1_000.5e1_0
71+
72+
1.0005e13
73+
```
74+
75+
Hexadecimal, octal, and binary:
76+
77+
``` unison
78+
> 0xFF_FF
79+
> 0o77_77
80+
> 0b1010_0101
81+
```
82+
83+
``` ucm :added-by-ucm
84+
Loading changes detected in scratch.u.
85+
86+
No changes found.
87+
88+
1 | > 0xFF_FF
89+
90+
65535
91+
92+
2 | > 0o77_77
93+
94+
4095
95+
96+
3 | > 0b1010_0101
97+
98+
165
99+
```
100+
101+
## Invalid literals
102+
103+
Trailing underscores and consecutive underscores are rejected:
104+
105+
``` unison :error
106+
x = 1_
107+
```
108+
109+
``` ucm :added-by-ucm
110+
Loading changes detected in scratch.u.
111+
112+
I got confused here:
113+
114+
1 | x = 1_
115+
116+
117+
I was surprised to find a
118+
here.
119+
I was expecting one of these instead:
120+
121+
* decimal digit
122+
* end of input
123+
```
124+
125+
``` unison :error
126+
x = 1__2
127+
```
128+
129+
``` ucm :added-by-ucm
130+
Loading changes detected in scratch.u.
131+
132+
I got confused here:
133+
134+
1 | x = 1__2
135+
136+
137+
I was surprised to find a _ here.
138+
I was expecting one of these instead:
139+
140+
* decimal digit
141+
* end of input
142+
```
143+
144+
``` unison :error
145+
x = 0xFF_
146+
```
147+
148+
``` ucm :added-by-ucm
149+
Loading changes detected in scratch.u.
150+
151+
I got confused here:
152+
153+
1 | x = 0xFF_
154+
155+
156+
I was surprised to find a
157+
here.
158+
I was expecting one of these instead:
159+
160+
* hexadecimal digit
161+
```

unison-syntax/src/Unison/Syntax/Lexer/Unison.hs

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ where
3030

3131
import Control.Lens qualified as Lens
3232
import Control.Monad.State qualified as S
33-
import Data.Char (isAlphaNum, isDigit, isSpace, ord, toLower)
33+
import Data.Char (digitToInt, isAlphaNum, isDigit, isHexDigit, isOctDigit, isSpace, ord, toLower)
3434
import Data.Foldable qualified as Foldable
3535
import Data.Functor.Classes (Show1 (..), showsPrec1)
3636
import Data.List qualified as List
@@ -492,21 +492,21 @@ lexemes eof =
492492

493493
numeric = bytes <|> otherbase <|> float <|> intOrNat
494494
where
495-
intOrNat = P.try $ num <$> sign <*> LP.decimal
495+
intOrNat = P.try $ num <$> sign <*> (digitsToInteger 10 <$> digitsWithUnderscores "decimal digit" isDigit)
496496
float = do
497-
_ <- P.try (P.lookAhead (sign >> (LP.decimal :: P Int) >> (char '.' <|> char 'e' <|> char 'E'))) -- commit after this
497+
_ <- P.try (P.lookAhead (sign >> digitsWithUnderscores "decimal digit" isDigit >> (char '.' <|> char 'e' <|> char 'E'))) -- commit after this
498498
start <- posP
499499
sign <- fromMaybe "" <$> sign
500-
base <- P.takeWhile1P (Just "base") isDigit
500+
base <- digitsWithUnderscores "base" isDigit
501501
decimals <-
502502
P.optional $
503503
let missingFractional = err start (MissingFractional $ base <> ".")
504-
in liftA2 (<>) (lit ".") (P.takeWhile1P (Just "decimals") isDigit <|> missingFractional)
504+
in liftA2 (<>) (lit ".") (digitsWithUnderscores "decimals" isDigit <|> missingFractional)
505505
exp <- P.optional $ do
506506
e <- map toLower <$> (lit "e" <|> lit "E")
507507
sign <- fromMaybe "" <$> optional (lit "+" <|> lit "-")
508508
let missingExp = err start (MissingExponent $ base <> fromMaybe "" decimals <> e <> sign)
509-
exp <- P.takeWhile1P (Just "exponent") isDigit <|> missingExp
509+
exp <- digitsWithUnderscores "exponent" isDigit <|> missingExp
510510
pure $ e <> sign <> exp
511511
pure $ Numeric (sign <> base <> fromMaybe "" decimals <> fromMaybe "" exp)
512512

@@ -518,23 +518,33 @@ lexemes eof =
518518
Left _ -> err start (InvalidBytesLiteral $ "0xs" <> s)
519519
Right bs -> pure (Bytes bs)
520520
otherbase = octal <|> hex <|> binary
521-
octal = do
522-
start <- posP
523-
commitAfter2 sign (lit "0o") $ \sign _ ->
524-
fmap (num sign) LP.octal <|> err start InvalidOctalLiteral
525-
hex = do
526-
start <- posP
527-
commitAfter2 sign (lit "0x") $ \sign _ ->
528-
fmap (num sign) LP.hexadecimal <|> err start InvalidHexLiteral
529-
binary = do
521+
-- The three non-decimal bases share one parser; they differ only in the
-- prefix, the radix, the digit label/predicate, and the error value handed
-- to 'baseWithPrefix'.
octal = baseWithPrefix "0o" 8 "octal digit" isOctDigit InvalidOctalLiteral
522+
hex = baseWithPrefix "0x" 16 "hexadecimal digit" isHexDigit InvalidHexLiteral
523+
binary = baseWithPrefix "0b" 2 "binary digit" isBinDigit InvalidBinaryLiteral
524+
525+
-- | Lex an integer literal written with a base prefix, allowing underscore
-- separators between its digits.
--
-- Arguments, in order: the prefix text to match, the numeric base used to
-- convert the digits, the label passed to 'digitsWithUnderscores', the
-- predicate accepting a digit of that base, and the error value passed to
-- @err@ when the digits cannot be parsed.
--
-- The digit parser is wrapped in 'P.try'; presumably this is so a digit run
-- that fails part-way backtracks and the @err start errType@ alternative runs.
-- NOTE(review): 'commitAfter2' is defined elsewhere — it appears to commit
-- once the optional sign and the prefix have both matched; confirm.
baseWithPrefix :: String -> Int -> String -> (Char -> Bool) -> Err -> P Lexeme
526+
baseWithPrefix prefix base label isValidDigit errType = do
530527
start <- posP
531-
commitAfter2 sign (lit "0b") $ \sign _ ->
532-
fmap (num sign) LP.binary <|> err start InvalidBinaryLiteral
528+
commitAfter2 sign (lit prefix) $ \sign _ ->
529+
fmap (num sign) (P.try $ digitsToInteger base <$> digitsWithUnderscores label isValidDigit)
530+
<|> err start errType
533531

534532
-- | Build a 'Numeric' lexeme from an optional sign and an integer value.
-- The value is rendered with 'show', i.e. as plain decimal digits, and a
-- missing sign contributes the empty string.
num :: Maybe String -> Integer -> Lexeme
535533
num sign n = Numeric (fromMaybe "" sign <> show n)
536534
-- An optional leading "+" or "-"; yields 'Nothing' when neither is present.
sign = P.optional (lit "+" <|> lit "-")
537535

536+
-- | True exactly for the characters @'0'@ and @'1'@.
isBinDigit :: Char -> Bool
537+
isBinDigit c = c == '0' || c == '1'
538+
539+
-- | Parse a run of digits that may contain single underscore separators and
-- return the digits with the underscores removed.
--
-- The first argument is the label given to 'P.takeWhile1P' (used in
-- "expecting ..." messages); the second decides which characters count as
-- digits.
--
-- The literal must start with at least one digit, and every underscore must
-- be followed by at least one digit, so a leading, trailing, or doubled
-- underscore is not matched as part of the digit run.
digitsWithUnderscores :: String -> (Char -> Bool) -> P String
540+
digitsWithUnderscores label isValidDigit = do
541+
first <- P.takeWhile1P (Just label) isValidDigit
542+
rest <- many (char '_' *> P.takeWhile1P (Just label) isValidDigit)
543+
pure $ mconcat $ first : rest
544+
545+
-- | Interpret a string of digit characters as a non-negative 'Integer' in the
-- given base, most significant digit first. The empty string yields 0.
--
-- Each character is converted with 'digitToInt', which is partial: it only
-- accepts hexadecimal digit characters. The function does not check that a
-- digit is smaller than the base, so callers must pass only characters that
-- are valid for that base (no underscores, signs, or prefixes).
digitsToInteger :: Int -> String -> Integer
546+
digitsToInteger base = foldl' (\acc c -> acc * toInteger base + toInteger (digitToInt c)) 0
547+
538548
hash = Hash <$> P.try shortHashP
539549

540550
reserved :: P [Token Lexeme]

unison-syntax/test/Unison/Test/Unison.hs

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,55 @@ test =
207207
t "{foo\n,bar}" [Open "{", simpleWordyId "foo", Reserved ",", simpleWordyId "bar", Close],
208208
t "{foo\n ,bar}" [Open "{", simpleWordyId "foo", Reserved ",", simpleWordyId "bar", Close],
209209
t "[foo\n,bar]" [Open "[", simpleWordyId "foo", Reserved ",", simpleWordyId "bar", Close],
210-
t "[foo\n ,bar]" [Open "[", simpleWordyId "foo", Reserved ",", simpleWordyId "bar", Close]
210+
t "[foo\n ,bar]" [Open "[", simpleWordyId "foo", Reserved ",", simpleWordyId "bar", Close],
211+
-- Underscore separators in numeric literals (#2228)
212+
-- Decimal integers
213+
t "1_000" [Numeric "1000"],
214+
t "+1_000" [Numeric "+1000"],
215+
t "-1_000" [Numeric "-1000"],
216+
t "1_000_000" [Numeric "1000000"],
217+
-- Floats
218+
t "1_000.5" [Numeric "1000.5"],
219+
t "1_000.000_001" [Numeric "1000.000001"],
220+
-- Scientific notation
221+
t "1_000e1_0" [Numeric "1000e10"],
222+
t "1_000.5e1_0" [Numeric "1000.5e10"],
223+
t "1_000.5E1_0" [Numeric "1000.5e10"],
224+
t "+1_000.5e1_0" [Numeric "+1000.5e10"],
225+
t "-1_000.5e-1_0" [Numeric "-1000.5e-10"],
226+
-- Hex
227+
t "0xFF_FF" [Numeric "65535"],
228+
t "+0xFF_FF" [Numeric "+65535"],
229+
t "-0xFF_FF" [Numeric "-65535"],
230+
-- Octal
231+
t "0o77_77" [Numeric "4095"],
232+
-- Binary
233+
t "0b1010_0101" [Numeric "165"],
234+
t "+0b1010_0101" [Numeric "+165"],
235+
-- Trailing and consecutive underscores are rejected
236+
tError "1_",
237+
tError "1__2",
238+
tError "1_000_",
239+
tError "0xFF_",
240+
tError "0xFF__FF",
241+
tError "0o77_",
242+
tError "0b1010_",
243+
-- Underscore followed by non-digit is rejected (not parsed as two tokens)
244+
tError "1_x",
245+
tError "1_e3",
246+
-- Underscore immediately after base prefix is rejected
247+
tError "0x_FF",
248+
-- Underscore adjacent to period or exponent marker is rejected
249+
tError "1_.2",
250+
tError "1._2",
251+
tError "1e_2",
252+
tError "1_e2",
253+
-- Underscore after exponent sign is rejected
254+
tError "1e+_2",
255+
tError "1e-_2",
256+
-- Leading zeros, with and without underscore separators
257+
t "0_1" [Numeric "1"],
258+
t "007" [Numeric "7"]
211259
]
212260

213261
t :: String -> [Lexeme] -> Test ()
@@ -226,6 +274,18 @@ t s expected = case toList . preParse $ lexer filename s of
226274
where
227275
filename = "test case"
228276

277+
-- | Test helper: assert that lexing the given input produces an error.
--
-- The input is lexed and pre-parsed, and the resulting tokens are flattened
-- to a list. The test passes if any token carries an 'Err' payload;
-- otherwise it notes the payloads that were produced and crashes with
-- "expected error". The test is scoped under the input string itself.
tError :: String -> Test ()
278+
tError s = scope s $ case toList . preParse $ lexer filename s of
279+
ts
280+
| any isErr ts -> ok
281+
| otherwise -> do
282+
note $ "expected error but got: " ++ show (payload <$> ts)
283+
crash "expected error"
284+
where
285+
filename = "test case"
286+
-- A token counts as an error when its payload is the 'Err' constructor.
isErr (Token (Err _) _ _) = True
287+
isErr _ = False
288+
229289
simpleSymbolyId :: Text -> Lexeme
230290
simpleSymbolyId =
231291
SymbolyId . HQ'.unsafeParseText

0 commit comments

Comments
 (0)