-
Notifications
You must be signed in to change notification settings - Fork 85
/
Copy pathpanLexerScript.sml
323 lines (288 loc) · 9.69 KB
/
panLexerScript.sml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
(**
* A lexer for the Pancake language.
*)
(**
* We take significant inspiration from the Spark ADA development.
*
* Author: Craig McLaughlin
* Date: March--April 2022
*)
open HolKernel Parse boolLib bossLib stringLib numLib;
open arithmeticTheory stringTheory intLib listTheory locationTheory;
open ASCIInumbersTheory ASCIInumbersLib;
open mlstringTheory;
val _ = new_theory "panLexer";
Datatype:
keyword = SkipK | StK | StwK | St8K | St32K | IfK | ElseK | WhileK
| BrK | ContK | RaiseK | RetK | TicK | VarK | WithK | HandleK | BiwK
| LdsK | Ld8K | LdwK | Ld32K | BaseK | InK | FunK | ExportK | TrueK | FalseK
End
Datatype:
token = AndT | OrT | BoolAndT | BoolOrT | XorT | NotT
| EqT | NeqT | LessT | GreaterT | GeqT | LeqT | LowerT | HigherT | HigheqT | LoweqT
| PlusT | MinusT | DotT | StarT
| LslT | LsrT | AsrT | RorT
| IntT int | IdentT string | ForeignIdent string (* @ffi_str except @base *)
| LParT | RParT | CommaT | SemiT | ColonT | DArrowT | AddrT
| LBrakT | RBrakT | LCurT | RCurT
| AssignT
| StaticT
| KeywordT keyword
| AnnotCommentT string
| LexErrorT mlstring
End
Datatype:
atom = NumberA int | WordA string | SymA string | ErrA mlstring | AnnotCommentA string
End
Definition isAtom_singleton_def:
isAtom_singleton c = MEM c "+-^*().,;:[]{}"
End
Definition isAtom_begin_group_def:
(* # is for only for RorT,
* and should remove it to avoid collision with
* C-preprocessor later *)
isAtom_begin_group c = MEM c "#=><!&|"
End
Definition isAtom_in_group_def:
isAtom_in_group c = MEM c "=<>|&+"
End
Definition isAlphaNumOrWild_def:
isAlphaNumOrWild c ⇔ isAlphaNum c ∨ c = #"_"
End
Definition isLexErrorT_def:
(isLexErrorT (LexErrorT _) ⇔ T) ∧
(isLexErrorT _ ⇔ F)
End
Definition dest_lexErrorT_def:
(destLexErrorT (LexErrorT msg) = SOME msg) ∧
destLexErrorT _ = NONE
End
Definition get_token_def:
get_token s =
if s = "&&" then BoolAndT else
if s = "||" then BoolOrT else
if s = "&" then AndT else
if s = "|" then OrT else
if s = "^" then XorT else
if s = "==" then EqT else
if s = "!=" then NeqT else
if s = "<" then LessT else
if s = ">" then GreaterT else
if s = ">=" then GeqT else
if s = "<=" then LeqT else
if s = "<+" then LowerT else
if s = ">+" then HigherT else
if s = ">=+" then HigheqT else
if s = "<=+" then LoweqT else
if s = "=>" then DArrowT else
if s = "!" then NotT else
if s = "+" then PlusT else
if s = "-" then MinusT else
if s = "*" then StarT else
if s = "." then DotT else
if s = "<<" then LslT else
if s = ">>>" then LsrT else
if s = ">>" then AsrT else
if s = "#>>" then RorT else
if s = "(" then LParT else
if s = ")" then RParT else
if s = "," then CommaT else
if s = ";" then SemiT else
if s = ":" then ColonT else
if s = "[" then LBrakT else
if s = "]" then RBrakT else
if s = "{" then LCurT else
if s = "}" then RCurT else
if s = "=" then AssignT else
LexErrorT $ concat [«Unrecognised symbolic token: »; implode s]
End
Definition get_keyword_def:
get_keyword s =
if s = "skip" then (KeywordT SkipK) else
if s = "st" then (KeywordT StK) else
if s = "stw" then (KeywordT StwK) else
if s = "st8" then (KeywordT St8K) else
if s = "st32" then (KeywordT St32K) else
if s = "if" then (KeywordT IfK) else
if s = "else" then (KeywordT ElseK) else
if s = "while" then (KeywordT WhileK) else
if s = "break" then (KeywordT BrK) else
if s = "continue" then (KeywordT ContK) else
if s = "raise" then (KeywordT RaiseK) else
if s = "return" then (KeywordT RetK) else
if s = "tick" then (KeywordT TicK) else
if s = "var" then (KeywordT VarK) else
if s = "in" then (KeywordT InK) else
if s = "with" then (KeywordT WithK) else
if s = "handle" then (KeywordT HandleK) else
if s = "lds" then (KeywordT LdsK) else
if s = "ldw" then (KeywordT LdwK) else
if s = "ld8" then (KeywordT Ld8K) else
if s = "ld32" then (KeywordT Ld32K) else
if s = "@base" then (KeywordT BaseK) else
if s = "@biw" then (KeywordT BiwK) else
if s = "true" then (KeywordT TrueK) else
if s = "false" then (KeywordT FalseK) else
if s = "fun" then (KeywordT FunK) else
if s = "export" then (KeywordT ExportK) else
if s = "" then LexErrorT $ «Expected keyword, found empty string» else
if 2 <= LENGTH s ∧ EL 0 s = #"@" then ForeignIdent (DROP 1 s)
else
IdentT s
End
Definition token_of_atom_def:
token_of_atom a =
case a of
| NumberA i => IntT i
| WordA s => get_keyword s
| SymA s => get_token s
| ErrA s => LexErrorT s
| AnnotCommentA s => AnnotCommentT s
End
Definition read_while_def:
(read_while P "" s = (IMPLODE (REVERSE s), "")) ∧
(read_while P (STRING c cs) s =
if P c then read_while P cs (c :: s)
else (IMPLODE (REVERSE s), STRING c cs))
End
Theorem read_while_thm:
∀P input s s' rest.
read_while P input s = (s', rest) ⇒ LENGTH rest ≤ LENGTH input
Proof
Induct_on `input` >> rw[read_while_def] >> gvs[]
>> metis_tac[DECIDE “x ≤ y ⇒ x ≤ SUC y”]
QED
Definition next_loc_def:
next_loc n (POSN r c) = POSN r (c+n) ∧
next_loc n x = x
End
Definition next_line_def:
next_line (POSN r c) = POSN (r+1) 0 ∧
next_line x = x
End
Definition loc_row_def:
loc_row n = POSN n 1
End
Definition skip_comment_def:
skip_comment "" _ _ = NONE ∧
skip_comment (x::xs) loc i =
(case x of
| #"\n" => SOME (next_line loc, i + 1n)
| _ => skip_comment xs (next_loc 1 loc) (i + 1n))
End
(* return SOME (loc, i, j) -> i characters in comment, DROP j characters to
continue parsing (after the closing '*/') at loc *)
Definition skip_block_comment_def:
skip_block_comment "" _ _ = NONE ∧
skip_block_comment [_] _ _ = NONE ∧
skip_block_comment (x::y::xs) loc i =
if (x = #"*" ∧ y = #"/") ∨ (x = #"@" ∧ y = #"/") then
SOME (next_loc 2 loc, i, i + 2)
else if x = #"\n" then
skip_block_comment (y::xs) (next_line loc) (i + 1n)
else
skip_block_comment (y::xs) (next_loc 1 loc) (i + 1n)
End
Definition unhex_alt_def:
unhex_alt x = if isHexDigit x then UNHEX x else 0n
End
Definition num_from_dec_string_alt_def:
num_from_dec_string_alt = s2n 10 unhex_alt
End
Definition next_atom_def:
next_atom "" _ = NONE ∧
next_atom (c::cs) loc =
if c = #"\n" then (* Skip Newline *)
next_atom cs (next_line loc)
else if isSpace c then
next_atom cs (next_loc 1 loc)
else if isDigit c then
let (n, cs') = read_while isDigit cs [c] in
SOME (NumberA &(num_from_dec_string_alt n),
Locs loc (next_loc (LENGTH n) loc),
cs')
else if c = #"-" ∧ cs ≠ "" ∧ isDigit (HD cs) then
let (n, rest) = read_while isDigit cs [] in
SOME (NumberA (0 - &(num_from_dec_string_alt n)),
Locs loc (next_loc (LENGTH n) loc),
rest)
else if isPREFIX "//" (c::cs) then (* comment *)
(case (skip_comment (TL cs) (next_loc 2 loc) 0n) of
| NONE => SOME (ErrA «Malformed comment», Locs loc (next_loc 2 loc), "")
| SOME (loc', len) => next_atom (DROP (len + 1) cs) loc')
else if isPREFIX "/@" (c::cs) then (* annotation block comment *)
(case (skip_block_comment (TL cs) (next_loc 3 loc) 0n) of
| NONE => SOME (ErrA «Malformed comment», Locs loc (next_loc 3 loc), "")
| SOME (loc', i, len) => SOME (AnnotCommentA (TAKE i (TL cs)),
Locs loc loc', DROP (len + 1) cs))
else if isPREFIX "/*" (c::cs) then (* block comment *)
(case (skip_block_comment (TL cs) (next_loc 2 loc) 0n) of
| NONE => SOME (ErrA «Malformed comment», Locs loc (next_loc 2 loc), "")
| SOME (loc', _, len) => next_atom (DROP (len + 1) cs) loc')
else if isAtom_singleton c then
SOME (SymA (STRING c []), Locs loc loc, cs)
else if isAtom_begin_group c then
let (n, rest) = read_while isAtom_in_group cs [c] in
SOME (SymA n, Locs loc (next_loc (LENGTH n - 1) loc), rest)
else if isAlpha c ∨ c = #"@" then (* read identifier *)
let (n, rest) = read_while isAlphaNumOrWild cs [c] in
SOME (WordA n, Locs loc (next_loc (LENGTH n) loc), rest)
else (* input not recognised *)
SOME (ErrA $ concat [«Unrecognised symbol: »; str c], Locs loc loc, cs)
Termination
WF_REL_TAC `measure (LENGTH o FST)`
\\ simp []
End
Theorem next_atom_LESS:
∀input l s l' rest.
next_atom input l = SOME (s, l', rest) ⇒ LENGTH rest < LENGTH input
Proof
recInduct next_atom_ind \\ simp [next_atom_def]
\\ rw []
\\ TRY (first_x_assum drule)
\\ fs [miscTheory.UNCURRY_eq_pair, CaseEq "option", CaseEq "prod"]
\\ gvs []
\\ imp_res_tac read_while_thm
\\ fs []
\\ BasicProvers.EVERY_CASE_TAC \\ fs []
QED
Definition next_token_def:
next_token input loc =
case next_atom input loc of
| NONE => NONE
| SOME (atom, locs, rest) =>
SOME (token_of_atom atom, locs, rest)
End
Theorem next_token_LESS:
∀s l l' rest input. (next_token input l = SOME (s, l', rest)) ⇒
LENGTH rest < LENGTH input
Proof
rw[next_token_def, AllCaseEqs()] >> metis_tac[next_atom_LESS]
QED
Definition init_loc_def:
init_loc = POSN 1 1
End
Definition pancake_lex_aux_def:
pancake_lex_aux input loc =
(case next_token input loc of
| NONE => []
| SOME (token, Locs locB locE, rest) =>
(token, Locs locB locE) :: pancake_lex_aux rest locE)
Termination
WF_REL_TAC ‘measure (LENGTH o FST)’ >> rw[]
>> imp_res_tac next_token_LESS
End
Definition pancake_lex_def:
pancake_lex input = pancake_lex_aux input init_loc
End
Definition safe_pancake_lex_def:
safe_pancake_lex input =
(let output = pancake_lex input in
case FILTER (isLexErrorT ∘ FST) output of
[] => INL output
| xs => INR $ MAP ((the «» ∘ destLexErrorT) ## I) xs)
End
(* Tests :
EVAL ``pancake_lex "x + 1 --Then check y\n && y - 2 <= -3 || !z"``;
*)
val _ = export_theory();