Skip to content

Commit 0254c32

Browse files
Merge pull request #401 from sosy-lab/improve_opensmt2_parsing
Strip SMT-LIB2 queries before parsing
2 parents d808f78 + 4c2e56e commit 0254c32

16 files changed

+870
-487
lines changed

src/org/sosy_lab/java_smt/basicimpl/AbstractFormulaManager.java

Lines changed: 77 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,28 +18,17 @@
1818
import com.google.common.collect.ImmutableMap;
1919
import com.google.common.collect.ImmutableSet;
2020
import com.google.common.collect.Iterables;
21+
import java.io.IOException;
2122
import java.util.Arrays;
2223
import java.util.List;
2324
import java.util.Map;
2425
import org.checkerframework.checker.nullness.qual.Nullable;
2526
import org.sosy_lab.common.Appender;
26-
import org.sosy_lab.java_smt.api.ArrayFormulaManager;
27-
import org.sosy_lab.java_smt.api.BooleanFormula;
28-
import org.sosy_lab.java_smt.api.EnumerationFormulaManager;
29-
import org.sosy_lab.java_smt.api.FloatingPointFormulaManager;
30-
import org.sosy_lab.java_smt.api.Formula;
31-
import org.sosy_lab.java_smt.api.FormulaManager;
32-
import org.sosy_lab.java_smt.api.FormulaType;
27+
import org.sosy_lab.common.Appenders;
28+
import org.sosy_lab.java_smt.api.*;
3329
import org.sosy_lab.java_smt.api.FormulaType.ArrayFormulaType;
3430
import org.sosy_lab.java_smt.api.FormulaType.BitvectorType;
3531
import org.sosy_lab.java_smt.api.FormulaType.FloatingPointType;
36-
import org.sosy_lab.java_smt.api.FunctionDeclaration;
37-
import org.sosy_lab.java_smt.api.FunctionDeclarationKind;
38-
import org.sosy_lab.java_smt.api.IntegerFormulaManager;
39-
import org.sosy_lab.java_smt.api.RationalFormulaManager;
40-
import org.sosy_lab.java_smt.api.SLFormulaManager;
41-
import org.sosy_lab.java_smt.api.StringFormulaManager;
42-
import org.sosy_lab.java_smt.api.Tactic;
4332
import org.sosy_lab.java_smt.api.visitors.FormulaTransformationVisitor;
4433
import org.sosy_lab.java_smt.api.visitors.FormulaVisitor;
4534
import org.sosy_lab.java_smt.api.visitors.TraversalProcess;
@@ -261,11 +250,83 @@ public EnumerationFormulaManager getEnumerationFormulaManager() {
261250
return enumManager;
262251
}
263252

264-
public abstract Appender dumpFormula(TFormulaInfo t);
253+
protected abstract TFormulaInfo parseImpl(String formulaStr) throws IllegalArgumentException;
254+
255+
/**
256+
* Takes a SMT-LIB2 script and cleans it up.
257+
*
258+
* <p>We remove all comments and put each command on its own line. Declarations and asserts are
259+
* kept and everything else is removed. For <code>(set-logic ..)</code> we make sure that it's at
260+
* the top of the file before removing it, and for <code>(exit)</code> we make sure that it can
261+
* only occur as the last command.
262+
*/
263+
private String sanitize(String formulaStr) {
264+
List<String> tokens = Tokenizer.tokenize(formulaStr);
265+
266+
StringBuilder builder = new StringBuilder();
267+
int pos = 0; // index of the current token
268+
269+
for (String token : tokens) {
270+
if (Tokenizer.isSetLogicToken(token)) {
271+
// Skip the (set-logic ...) command at the beginning of the input
272+
Preconditions.checkArgument(pos == 0);
273+
274+
} else if (Tokenizer.isExitToken(token)) {
275+
// Skip the (exit) command at the end of the input
276+
Preconditions.checkArgument(pos == tokens.size() - 1);
277+
278+
} else if (Tokenizer.isDeclarationToken(token)
279+
|| Tokenizer.isDefinitionToken(token)
280+
|| Tokenizer.isAssertToken(token)) {
281+
// Keep only declaration, definitions and assertion
282+
builder.append(token).append('\n');
283+
284+
} else if (Tokenizer.isForbiddenToken(token)) {
285+
// Throw an exception if the script contains commands like (pop) or (reset) that change the
286+
// state of the assertion stack.
287+
// We could keep track of the state of the stack and only consider the formulas that remain
288+
// on the stack at the end of the script. However, this does not seem worth it at the
289+
// moment. If needed, this feature can still be added later.
290+
String message;
291+
if (Tokenizer.isPushToken(token)) {
292+
message = "(push ...)";
293+
} else if (Tokenizer.isPopToken(token)) {
294+
message = "(pop ...)";
295+
} else if (Tokenizer.isResetAssertionsToken(token)) {
296+
message = "(reset-assertions)";
297+
} else if (Tokenizer.isResetToken(token)) {
298+
message = "(reset)";
299+
} else {
300+
// Should be unreachable
301+
throw new UnsupportedOperationException();
302+
}
303+
throw new IllegalArgumentException(
304+
String.format("SMTLIB command '%s' is not supported when parsing formulas.", message));
305+
306+
} else {
307+
// Remove everything else
308+
}
309+
pos++;
310+
}
311+
return builder.toString();
312+
}
313+
314+
@Override
315+
public BooleanFormula parse(String formulaStr) throws IllegalArgumentException {
316+
return formulaCreator.encapsulateBoolean(parseImpl(sanitize(formulaStr)));
317+
}
318+
319+
protected abstract String dumpFormulaImpl(TFormulaInfo t) throws IOException;
265320

266321
@Override
267322
public Appender dumpFormula(BooleanFormula t) {
268-
return dumpFormula(formulaCreator.extractInfo(t));
323+
return new Appenders.AbstractAppender() {
324+
@Override
325+
public void appendTo(Appendable out) throws IOException {
326+
String raw = dumpFormulaImpl(formulaCreator.extractInfo(t));
327+
out.append(sanitize(raw));
328+
}
329+
};
269330
}
270331

271332
@Override
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
/*
2+
* This file is part of JavaSMT,
3+
* an API wrapper for a collection of SMT solvers:
4+
* https://github.com/sosy-lab/java-smt
5+
*
6+
* SPDX-FileCopyrightText: 2024 Dirk Beyer <https://www.sosy-lab.org>
7+
*
8+
* SPDX-License-Identifier: Apache-2.0
9+
*/
10+
11+
package org.sosy_lab.java_smt.basicimpl;
12+
13+
import com.google.common.collect.ImmutableList;
14+
import java.util.List;
15+
import java.util.Optional;
16+
17+
/** Helper class for splitting up an SMT-LIB2 file into a string of commands */
18+
public class Tokenizer {
19+
/**
20+
* Split up a sequence of lisp expressions.
21+
*
22+
* <p>This is used by {@link AbstractFormulaManager#parse(String)} as part of the preprocessing
23+
* before the input is passed on to the solver. SMT-LIB2 scripts are sequences of commands that
24+
* are just r-expression. We split them up and then return the list.
25+
*
26+
* <p>As an example <code>tokenize("(define-const a Int)(assert (= a 0)")</code> will return the
27+
* sequence <code>["(define-const a Int)", "(assert (= a 0))"]</code>
28+
*/
29+
public static List<String> tokenize(String input) {
30+
ImmutableList.Builder<String> builder = ImmutableList.builder();
31+
boolean inComment = false;
32+
boolean inString = false;
33+
boolean inQuoted = false;
34+
35+
int level = 0;
36+
37+
StringBuilder token = new StringBuilder();
38+
for (int i = 0; i < input.length(); i++) {
39+
char c = input.charAt(i);
40+
if (inComment) {
41+
if (c == '\n') {
42+
// End of a comment
43+
inComment = false;
44+
if (level > 0) {
45+
// If we're in an expression we need to replace the entire comment (+ the newline) with
46+
// some whitespace. Otherwise symbols might get merged across line-wraps. This is not
47+
// a problem at the top-level where all terms are surrounded by brackets.
48+
token.append(c);
49+
}
50+
}
51+
52+
} else if (inString) {
53+
if (c == '"') {
54+
// We have a double quote: Check that it's not followed by another and actually closes
55+
// the string.
56+
Optional<Character> n =
57+
(i == input.length() - 1) ? Optional.empty() : Optional.of(input.charAt(i + 1));
58+
if (n.isEmpty() || n.orElseThrow() != '"') {
59+
// Close the string
60+
token.append(c);
61+
inString = false;
62+
} else {
63+
// Add both quotes to the token and skip one character ahead
64+
token.append(c);
65+
token.append(n.orElseThrow());
66+
i++;
67+
}
68+
} else {
69+
token.append(c);
70+
}
71+
72+
} else if (inQuoted) {
73+
if (c == '|') {
74+
// Close the quotes
75+
inQuoted = false;
76+
}
77+
if (c == '\\') {
78+
// The SMT-LIB2 standard does not allow backslash inside quoted symbols:
79+
// Throw an exception
80+
throw new IllegalArgumentException();
81+
}
82+
token.append(c);
83+
84+
} else if (c == ';') {
85+
// Start of a comment
86+
inComment = true;
87+
88+
} else if (c == '"') {
89+
// Start of a string literal
90+
inString = true;
91+
token.append(c);
92+
93+
} else if (c == '|') {
94+
// Start of a quoted symbol
95+
inQuoted = true;
96+
token.append(c);
97+
98+
} else {
99+
// Just a regular character outside of comments, quotes or string literals
100+
if (level == 0) {
101+
// We're at the top-level
102+
if (!Character.isWhitespace(c)) {
103+
if (c == '(') {
104+
// Handle opening brackets
105+
token.append("(");
106+
level++;
107+
} else {
108+
// Should be unreachable: all top-level expressions need parentheses around them
109+
throw new IllegalArgumentException();
110+
}
111+
}
112+
} else {
113+
// We're inside an r-expression
114+
token.append(c);
115+
// Handle opening/closing brackets
116+
if (c == '(') {
117+
level++;
118+
}
119+
if (c == ')') {
120+
if (level == 1) {
121+
builder.add(token.toString());
122+
token = new StringBuilder();
123+
}
124+
level--;
125+
}
126+
}
127+
}
128+
}
129+
if (level != 0) {
130+
// Throw an exception if the brackets don't match
131+
throw new IllegalArgumentException();
132+
}
133+
return builder.build();
134+
}
135+
136+
private static boolean matchesOneOf(String token, String... regexp) {
137+
return token.matches("\\(\\s*(" + String.join("|", regexp) + ")[\\S\\s]*");
138+
}
139+
140+
/**
141+
* Check if the token is <code>(set-logic ..)</code>.
142+
*
143+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
144+
*/
145+
public static boolean isSetLogicToken(String token) {
146+
return matchesOneOf(token, "set-logic");
147+
}
148+
149+
/**
150+
* Check if the token is a function or variable declaration.
151+
*
152+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
153+
*/
154+
public static boolean isDeclarationToken(String token) {
155+
return matchesOneOf(token, "declare-const", "declare-fun");
156+
}
157+
158+
/**
159+
* Check if the token is a function definition.
160+
*
161+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
162+
*/
163+
public static boolean isDefinitionToken(String token) {
164+
return matchesOneOf(token, "define-fun");
165+
}
166+
167+
/**
168+
* Check if the token is an <code>(assert ...)</code>.
169+
*
170+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
171+
*/
172+
public static boolean isAssertToken(String token) {
173+
return matchesOneOf(token, "assert");
174+
}
175+
176+
/**
177+
* Check if the token is an <code>(push ...)</code>.
178+
*
179+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
180+
*/
181+
public static boolean isPushToken(String token) {
182+
return matchesOneOf(token, "push");
183+
}
184+
185+
/**
186+
* Check if the token is an <code>(pop ...)</code>.
187+
*
188+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
189+
*/
190+
public static boolean isPopToken(String token) {
191+
return matchesOneOf(token, "pop");
192+
}
193+
194+
/**
195+
* Check if the token is an <code>(reset-assertions ...)</code>.
196+
*
197+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
198+
*/
199+
public static boolean isResetAssertionsToken(String token) {
200+
return matchesOneOf(token, "reset-assertions");
201+
}
202+
203+
/**
204+
* Check if the token is an <code>(reset)</code>.
205+
*
206+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
207+
*/
208+
public static boolean isResetToken(String token) {
209+
return matchesOneOf(token, "reset");
210+
}
211+
212+
/**
213+
* Check if the token is <code>(exit)</code>.
214+
*
215+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
216+
*/
217+
public static boolean isExitToken(String token) {
218+
return matchesOneOf(token, "exit");
219+
}
220+
221+
/**
222+
* Check if this is a forbidden token.
223+
*
224+
* <p>The list of forbidden tokens contains:
225+
*
226+
* <ul>
227+
* <li>push
228+
* <li>pop
229+
* <li>reset-assertions
230+
* <li>reset
231+
* </ul>
232+
*
233+
* <p>Forbidden tokens manipulate the stack and are not supported while parsing SMT-lIB2 string.
234+
* When a forbidden token is found parsing should be aborted by throwing an {@link
235+
* IllegalArgumentException} exception.
236+
*
237+
* <p>Use {@link #tokenize(String)} to turn an SMT-LIB2 script into a string of input tokens.
238+
*/
239+
public static boolean isForbiddenToken(String token) {
240+
return isPushToken(token)
241+
|| isPopToken(token)
242+
|| isResetAssertionsToken(token)
243+
|| isResetToken(token);
244+
}
245+
}

0 commit comments

Comments
 (0)