-
Notifications
You must be signed in to change notification settings - Fork 87
/
Copy pathMachineTypeScript.sml
351 lines (316 loc) · 10.3 KB
/
MachineTypeScript.sml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
(**
f machine-precision as a datatype for mixed-precision computations
@author: Raphael Monat
@maintainer: Heiko Becker
**)
open realTheory realLib;
open RealSimpsTheory;
open preambleFloVer;
val _ = new_theory "MachineType";
Overload abs[local] = “realax$abs”
val _ = monadsyntax.enable_monadsyntax();
val _ = List.app monadsyntax.enable_monad ["option"];
Datatype:
mType = REAL | M16 | M32 | M64 | F num num bool (* first num is word length, second is fractional bits, bool is for sign of fractional bits *)
End
Definition isFixedPoint_def :
isFixedPoint (F w f s) = T /\
isFixedPoint _ = F
End
Definition maxExponent_def:
(maxExponent (REAL) = 0n) /\
(maxExponent (M16) = 15) /\
(maxExponent (M32) = 127) /\
(maxExponent (M64) = 1023) /\
(maxExponent (F w f s) = f)
End
Definition minExponentPos_def:
(minExponentPos (REAL) = 0n) /\
(minExponentPos (M16) = 14) /\
(minExponentPos (M32) = 126) /\
(minExponentPos (M64) = 1022) /\
(minExponentPos (F w f s) = f)
End
(**
Goldberg - Handbook of Floating-Point Arithmetic: (p.183)
(𝛃 - 𝛃^(1 - p)) * 𝛃^(e_max)
which simplifies to 2^(e_max) for base 2
**)
Definition maxValue_def:
maxValue (m:mType) =
case m of
| F w f s => &((2n ** (w-1)) - 1) * (if s then &(2n ** (maxExponent m)) else inv (&(2n ** (maxExponent m))))
| _ => (&(2n ** (maxExponent m))):real
End
(** Using the sign bit, the very same number is representable as a negative number,
thus just apply negation here **)
Definition minValue_pos_def:
minValue_pos (m:mType) =
case m of
| F w f s => 0
| _ => inv (&(2n ** (minExponentPos m)))
End
(** Goldberg - Handbook of Floating-Point Arithmetic: (p.183)
𝛃^(e_min -p + 1) = 𝛃^(e_min -1) for base 2
**)
Definition normal_def:
normal (v:real) (m:mType) =
(minValue_pos m <= abs v /\ abs v <= maxValue m)
End
Definition denormal_def:
denormal (v:real) (m:mType) =
case m of
| REAL => F
| _ => ((abs v) < (minValue_pos m) /\ v <> 0)
End
(** Return an upperbound on the error for value v committed by rounding
For real numbers this is 0,
For floating-point numbers, we have to first check whether the the value is
denormal or normal. For denormal numbers, we use the upper bound
1/2 * 2^(-e_min - p + 1) = 2^(-1) * 2^(-e_min - p + 1)) =
2 ^(-1 - e_min - p + 1) = 2^(-(e_min + p)) = inv (2 ^(e_min + p))
where p is the number of precision bits for the floating-point type
For normal numbers the roundoff error is at most
1/2 * 2^(1 - p) = 2^(-1) * 2^(1-p) = 2^(-1 + 1 -p) = 2^(-p) = inv (2^p)
where p is the number of precision bits.
For fixed-point numbers with f fractional bits, the truncation error is at
maximum inv (2 pow f)
see p. 24 of Handbook of Floating-Point Arithmetic for details *)
Definition mTypeToR_def:
mTypeToR (m:mType) (v:real):real =
case m of
| REAL => 0
| M16 => if (denormal v M16)
then inv (2 pow (minExponentPos M16 + 11))
else inv (2 pow 11)
| M32 => if (denormal v M32)
then inv(2 pow (minExponentPos M32 + 24))
else inv (2 pow 24)
| M64 => if (denormal v M64)
then inv (2 pow (minExponentPos M64 + 53))
else inv (2 pow 53)
| F w f s => if s then (2 pow f) else inv (2 pow f)
End
Theorem mTypeToR_simp =
SIMP_RULE std_ss
(List.map EVAL [“minExponentPos M16”, “minExponentPos M32”, “minExponentPos M64”])
mTypeToR_def
(** Compute the maximum error contributed by rounding a value v to type m,
does not account for propagation errors, these must be accounted elsewhere *)
Definition computeError_def:
computeError v m =
case m of
| REAL => 0
| F w f s => mTypeToR m v
| _ => if (denormal v m) then mTypeToR m v else abs v * mTypeToR m v
End
Theorem mTypeToR_pos:
∀ m v. 0 ≤ mTypeToR m v
Proof
Cases_on `m` \\ fs[mTypeToR_def]
\\ rpt strip_tac \\ COND_CASES_TAC \\ fs[]
QED
Theorem computeError_pos:
0 ≤ computeError v m
Proof
fs[computeError_def]
\\ Cases_on ‘m’ \\ fs[mTypeToR_def]
\\ COND_CASES_TAC \\ fs[]
QED
Theorem normal_not_denormal:
normal x m ⇒ ~ denormal x m
Proof
fs[normal_def, denormal_def] \\ Cases_on ‘m’ \\ fs[REAL_NOT_LT]
QED
(** Show that the roundoff error contributed by normal numbers is worse than the
roundoff error contributed of any possible denormal number.
This is the key theorem for integrating denormals with the 1+delta model *)
Theorem computeError_up:
∀ v a b m.
abs v <= maxAbs (a,b) ⇒
(* normal (maxAbs(a,b)) m ⇒ *)
computeError v m <= computeError (maxAbs (a,b)) m
Proof
rpt strip_tac \\ Cases_on ‘m’
\\ fs[Excl "RMUL_LEQNORM", mTypeToR_def, computeError_def, maxAbs_def]
\\ imp_res_tac normal_not_denormal
\\ fs[Excl "RMUL_LEQNORM"]
\\ ntac 2 COND_CASES_TAC
\\ TRY (irule REAL_LE_LMUL_IMP \\ fs[] \\ irule REAL_LE_TRANS
\\ qexists_tac ‘max (abs a) (abs b)’ \\ fs[ABS_LE])
\\ fs[Excl "RMUL_LEQNORM", normal_def]
\\ TRY (fs[Excl "RMUL_LEQNORM", denormal_def, REAL_NOT_LT]
\\ ‘abs (max (abs a) (abs b)) = max (abs a) (abs b)’
by (fs[REAL_LE_MAX])
\\ ‘max (abs a) (abs b) < max (abs a) (abs b)’ suffices_by (fs[])
\\ RealArith.REAL_ASM_ARITH_TAC)
\\ fs[Excl "RMUL_LEQNORM", denormal_def]
\\ TRY (‘abs v ≤ 0’ suffices_by (fs[])
\\ pop_assum (rewrite_tac o single o GSYM)
\\ fs[] \\ NO_TAC)
\\ irule REAL_LE_TRANS
THENL (map qexists_tac
[‘1 / 2048 * minValue_pos M16’,
‘1 / 16777216 * minValue_pos M32’,
‘1 / 9007199254740992 * minValue_pos M64’])
\\ conj_tac \\ TRY (irule REAL_LE_LMUL_IMP \\ fs[REAL_NOT_LT])
\\ fs[minExponentPos_def, minValue_pos_def]
QED
(**
Check if machine precision m1 is more precise than machine precision m2.
REAL is real-valued evaluation, hence the most precise.
All floating-point types are compared by
mTypeToQ m1 <= mTypeToQ m2
All fixed-point types are compared by comparing only the word size
We consider a 32 bit fixed-point number to be more precise than a 16 bit
fixed-point number, no matter what the fractional bits may be.
For the moment, fixed-point and floating-point formats are incomparable.
**)
Definition isMorePrecise_def:
isMorePrecise (m1:mType) (m2:mType) =
case m1, m2 of
| REAL, _ => T
| F w1 f1 s1, F w2 f2 s2 =>
if s1 then
if s2 then
(w2 ≤ w1 ∧ f2 ≤ f1)
else
(w2 ≤ w1)
else if s2 then F
else (w2 ≤ w1 ∧ f2 ≤ f1)
| F w f s, _ => F
| _, F w f s=> F
| M16, M16 => T
| M32, M32 => T
| M32, M16 => T
| M64, REAL => F
| M64, _ => T
| _, _ => F
End
(**
More efficient version for performance, unfortunately we cannot do better
for fixed-points
**)
Definition morePrecise_def:
(morePrecise REAL _ = T) /\
(morePrecise (F w1 f1 s1) (F w2 f2 s2) =
(if s1 then
if s2 then
(w2 ≤ w1 ∧ f2 ≤ f1)
else
(w2 ≤ w1)
else if s2 then F
else (w2 ≤ w1 ∧ f2 ≤ f1))) ∧
(morePrecise (F w f s) _ = F) /\
(morePrecise _ (F w f s) = F) /\
(morePrecise M16 M16 = T) /\
(morePrecise M32 M32 = T) /\
(morePrecise M32 M16 = T) /\
(morePrecise M64 REAL = F) /\
(morePrecise M64 _ = T) /\
(morePrecise _ _ = F)
End
Theorem morePrecise_trans:
!m1 m2 m3.
morePrecise m1 m2 /\
morePrecise m2 m3 ==>
morePrecise m1 m3
Proof
rpt strip_tac
\\ Cases_on `m1` \\ Cases_on `m2` \\ Cases_on `m3`
\\ fs[morePrecise_def]
\\ every_case_tac \\ gs[]
QED
Theorem isMorePrecise_morePrecise:
!m1 m2.
isMorePrecise m1 m2 = morePrecise m1 m2
Proof
rpt strip_tac
\\ Cases_on `m1` \\ Cases_on `m2`
\\ once_rewrite_tac [morePrecise_def, isMorePrecise_def]
\\ fs[morePrecise_def, isMorePrecise_def, mTypeToR_def]
QED
Theorem isMorePrecise_refl:
!m. isMorePrecise m m
Proof
Cases_on `m` \\ EVAL_TAC \\ fs[]
QED
Theorem REAL_least_precision:
!(m:mType).
isMorePrecise m REAL ==> m = REAL
Proof
fs [isMorePrecise_def, mTypeToR_def]
\\ rpt strip_tac
\\ Cases_on `m`
\\ fs []
QED
Theorem REAL_lower_bound:
!(m:mType).
isMorePrecise REAL m
Proof
Cases_on `m` \\ EVAL_TAC
QED
(**
Function computing the join of two precisions, this is the most precise type,
in which evaluation has to be performed, e.g. addition of 32 and 64 bit floats
has to happen in 64 bits
**)
Definition join_fl_def:
join_fl (F w1 f1 s1) (F w2 f2 s2) = NONE /\
join_fl m1 m2 = if (morePrecise m1 m2) then SOME m1 else SOME m2
End
Definition join_fl3_def:
join_fl3 m1 m2 m3 = do mj <- join_fl m1 m2; join_fl mj m3; od
End
(** Since we cannot compute a join for Fixed-Points, we give a
isJoin predicate which returns true for fixed-points, but needs to inspect
it for floating-points **)
Definition isCompat_def:
isCompat (REAL) (REAL) = T /\
isCompat (F w1 f1 s1) (F w2 f2 s2) = morePrecise (F w2 f2 s2) (F w1 f1 s1) /\
isCompat (F _ _ _) _ = F /\
isCompat _ (F _ _ _) = F /\
isCompat m1 m2 = (m1 = m2)
End
Definition isJoin_def:
isJoin m1 m2 mj =
if (isFixedPoint m1 /\ isFixedPoint m2)
then morePrecise m1 mj /\ morePrecise m2 mj
else
(case join_fl m1 m2 of
|NONE => F
|SOME mNew => mNew = mj)
End
Definition isJoin3_def:
isJoin3 m1 m2 m3 mj =
if (isFixedPoint m1 /\ isFixedPoint m2 /\ isFixedPoint m3)
then morePrecise m1 mj /\ morePrecise m2 mj /\ morePrecise m3 mj
else
(case join_fl3 m1 m2 m3 of
|NONE => F
|SOME mNew => mNew = mj)
End
(**
Predicate that is true if and only if the given value v is a valid
floating-point value according to the the type m.
Since we use the 1 + 𝝳 abstraction, the value must either be
in normal range or 0.
**)
Definition validFloatValue_def:
validFloatValue (v:real) (m:mType) =
case m of
| REAL => T
| _ => normal v m \/ denormal v m \/ v = 0
End
Definition validValue_def:
validValue (v:real) (m:mType) =
case m of
| REAL => T
| _ => abs v <= maxValue m
End
Theorem no_underflow_fixed_point:
!v f w. ~ denormal v (F w f s)
Proof
fs[denormal_def, minValue_pos_def, REAL_NOT_LT, REAL_ABS_POS]
QED
val _ = export_theory();