Add re.DOTALL flag to RegexOperation (#1104)

ayushdg · charlesbluca · web-flow · commit 223ba52a64d3 · 2023-04-12T23:47:48.000-04:00
* Add the re.DOTALL flag to RegexOperation, and add support for ILIKE / NOT ILIKE

* Add tests for ilike, more tests for like/similar to with \n strings

* Unskip postgres string tests + add a few tests

* Add not similar to/like/ilike tests

* Simplify match flags expression

Co-authored-by: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>

---------

Co-authored-by: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py
@@ -434,31 +434,35 @@ def regex(self, test: SeriesOrScalar, regex: str, rex=None) -> SeriesOrScalar:
         transformed_regex = "^" + transformed_regex + "$"
 
         # Finally, apply the string
+        flags = re.DOTALL | re.IGNORECASE if not self.case_sensitive else re.DOTALL
         if is_frame(test):
-            return test.str.match(transformed_regex).astype("boolean")
+            return test.str.match(transformed_regex, flags=flags).astype("boolean")
         else:
-            return bool(re.match(transformed_regex, test))
+            return bool(re.match(transformed_regex, test, flags=flags))
 
 
 class LikeOperation(RegexOperation):
-    replacement_chars = [
-        "#",
-        "$",
-        "^",
-        ".",
-        "|",
-        "~",
-        "-",
-        "+",
-        "*",
-        "?",
-        "(",
-        ")",
-        "{",
-        "}",
-        "[",
-        "]",
-    ]
+    def __init__(self, case_sensitive: bool = True):
+        self.case_sensitive = case_sensitive
+        self.replacement_chars = [
+            "#",
+            "$",
+            "^",
+            ".",
+            "|",
+            "~",
+            "-",
+            "+",
+            "*",
+            "?",
+            "(",
+            ")",
+            "{",
+            "}",
+            "[",
+            "]",
+        ]
+        super().__init__()
 
 
 class SimilarOperation(RegexOperation):
@@ -470,6 +474,7 @@ class SimilarOperation(RegexOperation):
         "~",
         "-",
     ]
+    case_sensitive = True
 
 
 class PositionOperation(Operation):
@@ -997,8 +1002,11 @@ class RexCallPlugin(BaseRexPlugin):
         # special operations
         "cast": CastOperation(),
         "case": CaseOperation(),
-        "not like": NotOperation().of(LikeOperation()),
-        "like": LikeOperation(),
+        "not like": NotOperation().of(LikeOperation(case_sensitive=True)),
+        "like": LikeOperation(case_sensitive=True),
+        "not ilike": NotOperation().of(LikeOperation(case_sensitive=False)),
+        "ilike": LikeOperation(case_sensitive=False),
+        "not similar to": NotOperation().of(SimilarOperation()),
         "similar to": SimilarOperation(),
         "negative": NegativeOperation(),
         "not": NotOperation(),
diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py
@@ -92,7 +92,16 @@ def user_table_nan():
 
 @pytest.fixture()
 def string_table():
-    return pd.DataFrame({"a": ["a normal string", "%_%", "^|()-*[]$"]})
+    return pd.DataFrame(
+        {
+            "a": [
+                "a normal string",
+                "%_%",
+                "^|()-*[]$",
+                "^|()-*[]$\n%_%\na normal string",
+            ]
+        }
+    )
 
 
 @pytest.fixture()
diff --git a/tests/integration/test_postgres.py b/tests/integration/test_postgres.py
@@ -221,7 +221,6 @@ def test_filter(assert_query_gives_same_result):
     )
 
 
-@pytest.mark.xfail(reason="WIP DataFusion")
 def test_string_operations(assert_query_gives_same_result):
     assert_query_gives_same_result(
         """
@@ -232,27 +231,28 @@ def test_string_operations(assert_query_gives_same_result):
             s SIMILAR TO '%%(B|c)%%',
             s SIMILAR TO '%%[a-zA-Z]%%',
             s SIMILAR TO '.*',
+            s NOT SIMILAR TO '.*',
             s LIKE '%%(b|d)%%',
             s LIKE '%%(B|c)%%',
             s LIKE '%%[a-zA-Z]%%',
             s LIKE '.*',
+            S NOT LIKE '.*',
+            s ILIKE '%%(b|d)%%',
+            s ILIKE '%%(B|c)%%',
+            s NOT ILIKE '%%(b|d)%%',
+            s NOT ILIKE '%%(B|c)%%',
             CHAR_LENGTH(s),
             UPPER(s),
             LOWER(s),
-            POSITION('a' IN s),
-            POSITION('ZL' IN s),
             TRIM('a' FROM s),
             TRIM(BOTH 'a' FROM s),
             TRIM(LEADING 'a' FROM s),
             TRIM(TRAILING 'a' FROM s),
-            OVERLAY(s PLACING 'XXX' FROM 2),
-            OVERLAY(s PLACING 'XXX' FROM 2 FOR 4),
-            OVERLAY(s PLACING 'XXX' FROM 2 FOR 1),
             SUBSTRING(s FROM -1),
             SUBSTRING(s FROM 10),
             SUBSTRING(s FROM 2),
             SUBSTRING(s FROM 2 FOR 2),
-            SUBSTR(s,2,2),
+            SUBSTR(s,2,2) as s2,
             INITCAP(s),
             INITCAP(UPPER(s)),
             INITCAP(LOWER(s))
@@ -261,6 +261,31 @@ def test_string_operations(assert_query_gives_same_result):
     )
 
 
+@pytest.mark.xfail(reason="POSITION syntax not supported by parser")
+def test_string_position(assert_query_gives_same_result):
+    assert_query_gives_same_result(
+        """
+        SELECT
+            POSITION('a' IN s),
+            POSITION('ZL' IN s)
+        FROM df3
+    """
+    )
+
+
+@pytest.mark.xfail(reason="OVERLAY syntax not supported by parser")
+def test_string_overlay(assert_query_gives_same_result):
+    assert_query_gives_same_result(
+        """
+        SELECT
+            OVERLAY(s PLACING 'XXX' FROM 2),
+            OVERLAY(s PLACING 'XXX' FROM 2 FOR 4),
+            OVERLAY(s PLACING 'XXX' FROM 2 FOR 1)
+        FROM df3
+    """
+    )
+
+
 @pytest.mark.xfail(reason="WIP DataFusion")
 def test_statistical_functions(assert_query_gives_same_result):
 
diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py
@@ -258,18 +258,55 @@ def test_like(c, input_table, gpu, request):
         WHERE a SIMILAR TO '%n[a-z]rmal st_i%'
     """
     )
+    assert_eq(df, string_table.iloc[[0, 3]])
 
-    assert_eq(df, string_table.iloc[[0]])
+    df = c.sql(
+        f"""
+        SELECT * FROM {input_table}
+        WHERE a NOT SIMILAR TO '%n[a-z]rmal st_i%'
+    """
+    )
+    assert_eq(df, string_table.iloc[[1, 2]])
 
     df = c.sql(
         f"""
         SELECT * FROM {input_table}
         WHERE a LIKE '%n[a-z]rmal st_i%'
     """
     )
+    assert len(df) == 0
+
+    df = c.sql(
+        f"""
+        SELECT * FROM {input_table}
+        WHERE a NOT LIKE '%n[a-z]rmal st_i%'
+    """
+    )
+    assert_eq(df, string_table)
 
+    df = c.sql(
+        f"""
+        SELECT * FROM {input_table}
+        WHERE a LIKE '%a Normal String%'
+    """
+    )
     assert len(df) == 0
 
+    df = c.sql(
+        f"""
+        SELECT * FROM {input_table}
+        WHERE a ILIKE '%a Normal String%'
+    """
+    )
+    assert_eq(df, string_table.iloc[[0, 3]])
+
+    df = c.sql(
+        f"""
+        SELECT * FROM {input_table}
+        WHERE a NOT ILIKE '%a Normal String%'
+    """
+    )
+    assert_eq(df, string_table.iloc[[1, 2]])
     # TODO: uncomment when sqlparser adds parsing support for non-standard escape characters
     # https://github.com/dask-contrib/dask-sql/issues/754
     # df = c.sql(
@@ -288,7 +325,7 @@ def test_like(c, input_table, gpu, request):
         """
     )
 
-    assert_eq(df, string_table.iloc[[2]])
+    assert_eq(df, string_table.iloc[[2, 3]])
 
     df = c.sql(
         f"""
@@ -345,10 +382,10 @@ def test_null(c):
     """
     )
 
-    expected_df = pd.DataFrame(index=[0, 1, 2])
-    expected_df["nn"] = [True, True, True]
+    expected_df = pd.DataFrame(index=[0, 1, 2, 3])
+    expected_df["nn"] = [True, True, True, True]
     expected_df["nn"] = expected_df["nn"].astype("boolean")
-    expected_df["n"] = [False, False, False]
+    expected_df["n"] = [False, False, False, False]
     assert_eq(df, expected_df)