Skip to content

Commit 223ba52

Browse files
Add re.dotall flag to RegexOperations (#1104)
* Add dotall flag for regexOperations, and add support for ilike/not ilike
* Add tests for ilike, more tests for like/similar to with \n strings
* Unskip postgres string tests + add a few tests
* Add not similar to/like/ilike tests
* Simplify match flags expression

Co-authored-by: Charles Blackmon-Luca <[email protected]>

---------

Co-authored-by: Charles Blackmon-Luca <[email protected]>
1 parent 4ffded1 commit 223ba52

File tree

4 files changed

+114
-35
lines changed

4 files changed

+114
-35
lines changed

dask_sql/physical/rex/core/call.py

Lines changed: 30 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -434,31 +434,35 @@ def regex(self, test: SeriesOrScalar, regex: str, rex=None) -> SeriesOrScalar:
434434
transformed_regex = "^" + transformed_regex + "$"
435435

436436
# Finally, apply the string
437+
flags = re.DOTALL | re.IGNORECASE if not self.case_sensitive else re.DOTALL
437438
if is_frame(test):
438-
return test.str.match(transformed_regex).astype("boolean")
439+
return test.str.match(transformed_regex, flags=flags).astype("boolean")
439440
else:
440-
return bool(re.match(transformed_regex, test))
441+
return bool(re.match(transformed_regex, test, flags=flags))
441442

442443

443444
class LikeOperation(RegexOperation):
444-
replacement_chars = [
445-
"#",
446-
"$",
447-
"^",
448-
".",
449-
"|",
450-
"~",
451-
"-",
452-
"+",
453-
"*",
454-
"?",
455-
"(",
456-
")",
457-
"{",
458-
"}",
459-
"[",
460-
"]",
461-
]
445+
def __init__(self, case_sensitive: bool = True):
446+
self.case_sensitive = case_sensitive
447+
self.replacement_chars = [
448+
"#",
449+
"$",
450+
"^",
451+
".",
452+
"|",
453+
"~",
454+
"-",
455+
"+",
456+
"*",
457+
"?",
458+
"(",
459+
")",
460+
"{",
461+
"}",
462+
"[",
463+
"]",
464+
]
465+
super().__init__()
462466

463467

464468
class SimilarOperation(RegexOperation):
@@ -470,6 +474,7 @@ class SimilarOperation(RegexOperation):
470474
"~",
471475
"-",
472476
]
477+
case_sensitive = True
473478

474479

475480
class PositionOperation(Operation):
@@ -997,8 +1002,11 @@ class RexCallPlugin(BaseRexPlugin):
9971002
# special operations
9981003
"cast": CastOperation(),
9991004
"case": CaseOperation(),
1000-
"not like": NotOperation().of(LikeOperation()),
1001-
"like": LikeOperation(),
1005+
"not like": NotOperation().of(LikeOperation(case_sensitive=True)),
1006+
"like": LikeOperation(case_sensitive=True),
1007+
"not ilike": NotOperation().of(LikeOperation(case_sensitive=False)),
1008+
"ilike": LikeOperation(case_sensitive=False),
1009+
"not similar to": NotOperation().of(SimilarOperation()),
10021010
"similar to": SimilarOperation(),
10031011
"negative": NegativeOperation(),
10041012
"not": NotOperation(),

tests/integration/fixtures.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -92,7 +92,16 @@ def user_table_nan():
9292

9393
@pytest.fixture()
9494
def string_table():
95-
return pd.DataFrame({"a": ["a normal string", "%_%", "^|()-*[]$"]})
95+
return pd.DataFrame(
96+
{
97+
"a": [
98+
"a normal string",
99+
"%_%",
100+
"^|()-*[]$",
101+
"^|()-*[]$\n%_%\na normal string",
102+
]
103+
}
104+
)
96105

97106

98107
@pytest.fixture()

tests/integration/test_postgres.py

Lines changed: 32 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -221,7 +221,6 @@ def test_filter(assert_query_gives_same_result):
221221
)
222222

223223

224-
@pytest.mark.xfail(reason="WIP DataFusion")
225224
def test_string_operations(assert_query_gives_same_result):
226225
assert_query_gives_same_result(
227226
"""
@@ -232,27 +231,28 @@ def test_string_operations(assert_query_gives_same_result):
232231
s SIMILAR TO '%%(B|c)%%',
233232
s SIMILAR TO '%%[a-zA-Z]%%',
234233
s SIMILAR TO '.*',
234+
s NOT SIMILAR TO '.*',
235235
s LIKE '%%(b|d)%%',
236236
s LIKE '%%(B|c)%%',
237237
s LIKE '%%[a-zA-Z]%%',
238238
s LIKE '.*',
239+
S NOT LIKE '.*',
240+
s ILIKE '%%(b|d)%%',
241+
s ILIKE '%%(B|c)%%',
242+
s NOT ILIKE '%%(b|d)%%',
243+
s NOT ILIKE '%%(B|c)%%',
239244
CHAR_LENGTH(s),
240245
UPPER(s),
241246
LOWER(s),
242-
POSITION('a' IN s),
243-
POSITION('ZL' IN s),
244247
TRIM('a' FROM s),
245248
TRIM(BOTH 'a' FROM s),
246249
TRIM(LEADING 'a' FROM s),
247250
TRIM(TRAILING 'a' FROM s),
248-
OVERLAY(s PLACING 'XXX' FROM 2),
249-
OVERLAY(s PLACING 'XXX' FROM 2 FOR 4),
250-
OVERLAY(s PLACING 'XXX' FROM 2 FOR 1),
251251
SUBSTRING(s FROM -1),
252252
SUBSTRING(s FROM 10),
253253
SUBSTRING(s FROM 2),
254254
SUBSTRING(s FROM 2 FOR 2),
255-
SUBSTR(s,2,2),
255+
SUBSTR(s,2,2) as s2,
256256
INITCAP(s),
257257
INITCAP(UPPER(s)),
258258
INITCAP(LOWER(s))
@@ -261,6 +261,31 @@ def test_string_operations(assert_query_gives_same_result):
261261
)
262262

263263

264+
@pytest.mark.xfail(reason="POSITION syntax not supported by parser")
265+
def test_string_position(assert_query_gives_same_result):
266+
assert_query_gives_same_result(
267+
"""
268+
SELECT
269+
POSITION('a' IN s),
270+
POSITION('ZL' IN s)
271+
FROM df3
272+
"""
273+
)
274+
275+
276+
@pytest.mark.xfail(reason="OVERLAY syntax not supported by parser")
277+
def test_string_overlay(assert_query_gives_same_result):
278+
assert_query_gives_same_result(
279+
"""
280+
SELECT
281+
OVERLAY(s PLACING 'XXX' FROM 2),
282+
OVERLAY(s PLACING 'XXX' FROM 2 FOR 4),
283+
OVERLAY(s PLACING 'XXX' FROM 2 FOR 1)
284+
FROM df3
285+
"""
286+
)
287+
288+
264289
@pytest.mark.xfail(reason="WIP DataFusion")
265290
def test_statistical_functions(assert_query_gives_same_result):
266291

tests/integration/test_rex.py

Lines changed: 42 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -258,18 +258,55 @@ def test_like(c, input_table, gpu, request):
258258
WHERE a SIMILAR TO '%n[a-z]rmal st_i%'
259259
"""
260260
)
261+
assert_eq(df, string_table.iloc[[0, 3]])
261262

262-
assert_eq(df, string_table.iloc[[0]])
263+
df = c.sql(
264+
f"""
265+
SELECT * FROM {input_table}
266+
WHERE a NOT SIMILAR TO '%n[a-z]rmal st_i%'
267+
"""
268+
)
269+
assert_eq(df, string_table.iloc[[1, 2]])
263270

264271
df = c.sql(
265272
f"""
266273
SELECT * FROM {input_table}
267274
WHERE a LIKE '%n[a-z]rmal st_i%'
268275
"""
269276
)
277+
assert len(df) == 0
278+
279+
df = c.sql(
280+
f"""
281+
SELECT * FROM {input_table}
282+
WHERE a NOT LIKE '%n[a-z]rmal st_i%'
283+
"""
284+
)
285+
assert_eq(df, string_table)
270286

287+
df = c.sql(
288+
f"""
289+
SELECT * FROM {input_table}
290+
WHERE a LIKE '%a Normal String%'
291+
"""
292+
)
271293
assert len(df) == 0
272294

295+
df = c.sql(
296+
f"""
297+
SELECT * FROM {input_table}
298+
WHERE a ILIKE '%a Normal String%'
299+
"""
300+
)
301+
assert_eq(df, string_table.iloc[[0, 3]])
302+
303+
df = c.sql(
304+
f"""
305+
SELECT * FROM {input_table}
306+
WHERE a NOT ILIKE '%a Normal String%'
307+
"""
308+
)
309+
assert_eq(df, string_table.iloc[[1, 2]])
273310
# TODO: uncomment when sqlparser adds parsing support for non-standard escape characters
274311
# https://github.com/dask-contrib/dask-sql/issues/754
275312
# df = c.sql(
@@ -288,7 +325,7 @@ def test_like(c, input_table, gpu, request):
288325
"""
289326
)
290327

291-
assert_eq(df, string_table.iloc[[2]])
328+
assert_eq(df, string_table.iloc[[2, 3]])
292329

293330
df = c.sql(
294331
f"""
@@ -345,10 +382,10 @@ def test_null(c):
345382
"""
346383
)
347384

348-
expected_df = pd.DataFrame(index=[0, 1, 2])
349-
expected_df["nn"] = [True, True, True]
385+
expected_df = pd.DataFrame(index=[0, 1, 2, 3])
386+
expected_df["nn"] = [True, True, True, True]
350387
expected_df["nn"] = expected_df["nn"].astype("boolean")
351-
expected_df["n"] = [False, False, False]
388+
expected_df["n"] = [False, False, False, False]
352389
assert_eq(df, expected_df)
353390

354391

0 commit comments

Comments
 (0)