From 5e47dcb885289937b54ab13928fb7db41b7de95c Mon Sep 17 00:00:00 2001
From: Huon Wilson <huon@exoflare.io>
Date: Tue, 27 Aug 2024 09:15:54 +1000
Subject: [PATCH 1/6] Pull out new _extract_tokens for easier testing

---
 casbin/persist/adapter.py     | 17 ++++++++---
 tests/persist/test_adapter.py | 53 +++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 4 deletions(-)
 create mode 100644 tests/persist/test_adapter.py

diff --git a/casbin/persist/adapter.py b/casbin/persist/adapter.py
index 2c9b9a38..f4c69bdc 100644
--- a/casbin/persist/adapter.py
+++ b/casbin/persist/adapter.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 
 
-def load_policy_line(line, model):
-    """loads a text line as a policy rule to model."""
+def _extract_tokens(line):
+    """Return the list of 'tokens' from the line, or None if this line has none"""
 
     if line == "":
-        return
+        return None
 
     if line[:1] == "#":
-        return
+        return None
 
     stack = []
     tokens = []
@@ -40,6 +40,15 @@ def load_policy_line(line, model):
                 tokens[-1] += c
 
     tokens = [x.strip() for x in tokens]
+    return tokens
+
+
+def load_policy_line(line, model):
+    """loads a text line as a policy rule to model."""
+
+    tokens = _extract_tokens(line)
+    if tokens is None:
+        return
 
     key = tokens[0]
     sec = key[0]
diff --git a/tests/persist/test_adapter.py b/tests/persist/test_adapter.py
new file mode 100644
index 00000000..c6028544
--- /dev/null
+++ b/tests/persist/test_adapter.py
@@ -0,0 +1,53 @@
+from casbin.persist.adapter import _extract_tokens
+from tests import TestCaseBase
+
+
+class TestExtractTokens(TestCaseBase):
+    def test_ignore_lines(self):
+        self.assertIsNone(_extract_tokens(""))  # empty
+        self.assertIsNone(_extract_tokens("# comment"))
+
+    def test_simple_lines(self):
+        # split on top-level commas, strip whitespace from start and end
+        self.assertEqual(_extract_tokens("one"), ["one"])
+        self.assertEqual(_extract_tokens("one,two"), ["one", "two"])
+        self.assertEqual(_extract_tokens("   ignore  \t,\t   external, spaces  "), ["ignore", "external", "spaces"])
+
+        self.assertEqual(_extract_tokens("internal spaces preserved"), ["internal spaces preserved"])
+
+    def test_nested_lines(self):
+        # basic nesting within a single token
+        self.assertEqual(
+            _extract_tokens("outside1()"),
+            ["outside1()"],
+        )
+        self.assertEqual(
+            _extract_tokens("outside1(inside1())"),
+            ["outside1(inside1())"],
+        )
+
+        # split on top-level commas, but not on internal ones
+        self.assertEqual(
+            _extract_tokens("outside1(inside1(), inside2())"),
+            ["outside1(inside1(), inside2())"],
+        )
+        self.assertEqual(
+            _extract_tokens("outside1(inside1(), inside2(inside3(), inside4()))"),
+            ["outside1(inside1(), inside2(inside3(), inside4()))"],
+        )
+        self.assertEqual(
+            _extract_tokens("outside1(inside1(), inside2()), outside2(inside3(), inside4())"),
+            ["outside1(inside1(), inside2())", "outside2(inside3(), inside4())"],
+        )
+
+        # different delimiters
+        self.assertEqual(
+            _extract_tokens(
+                "all_square[inside1[], inside2[]],square_and_parens[inside1(), inside2()],parens_and_square(inside1[], inside2[])"
+            ),
+            [
+                "all_square[inside1[], inside2[]]",
+                "square_and_parens[inside1(), inside2()]",
+                "parens_and_square(inside1[], inside2[])",
+            ],
+        )

From f063e784b69b7909896ccb78f27cb5e2ffd9b027 Mon Sep 17 00:00:00 2001
From: Huon Wilson <huon@exoflare.io>
Date: Tue, 27 Aug 2024 09:26:29 +1000
Subject: [PATCH 2/6] [0002_original] Add a benchmark too:

-------------------------------------------------------------------------------------------------- benchmark: 4 tests --------------------------------------------------------------------------------------------------
Name (time in us)                                  Min                    Max               Mean             StdDev             Median               IQR            Outliers  OPS (Kops/s)            Rounds  Iterations
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_benchmark_extract_tokens_short_simple      1.4580 (1.0)          58.3340 (1.0)       1.5731 (1.0)       0.5280 (1.0)       1.5420 (1.0)      0.0411 (1.0)        53;455      635.6819 (1.0)       22305           1
test_benchmark_extract_tokens_short_nested      3.0829 (2.11)     13,417.2500 (230.01)    3.8216 (2.43)     49.5247 (93.80)     3.2500 (2.11)     0.2091 (5.09)      60;3604      261.6736 (0.41)     116509           1
test_benchmark_extract_tokens_long_simple      13.5830 (9.32)         95.4170 (1.64)     14.0610 (8.94)      1.0361 (1.96)     13.7920 (8.94)     0.2080 (5.06)   3089;10664       71.1189 (0.11)      57555           1
test_benchmark_extract_tokens_long_nested      26.5830 (18.23)        71.4579 (1.22)     27.2121 (17.30)     0.8780 (1.66)     27.1250 (17.59)    0.0841 (2.05)     566;2007       36.7484 (0.06)      30457           1
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---
 .github/workflows/build.yml           |  1 +
 tests/benchmarks/benchmark_adapter.py | 30 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 tests/benchmarks/benchmark_adapter.py

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c3e63bad..517a6e00 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -39,6 +39,7 @@ jobs:
           tests/benchmarks/benchmark_model.py
           tests/benchmarks/benchmark_management_api.py
           tests/benchmarks/benchmark_role_manager.py
+          tests/benchmarks/benchmark_adapter.py
 
       - name: Upload coverage data to coveralls.io
         run: coveralls --service=github
diff --git a/tests/benchmarks/benchmark_adapter.py b/tests/benchmarks/benchmark_adapter.py
new file mode 100644
index 00000000..3a094662
--- /dev/null
+++ b/tests/benchmarks/benchmark_adapter.py
@@ -0,0 +1,30 @@
+from casbin.persist.adapter import _extract_tokens
+
+
+def _benchmark_extract_tokens(benchmark, line):
+    @benchmark
+    def run_benchmark():
+        _extract_tokens(line)
+
+
+def test_benchmark_extract_tokens_short_simple(benchmark):
+    _benchmark_extract_tokens(benchmark, "abc,def,ghi")
+
+
+def test_benchmark_extract_tokens_long_simple(benchmark):
+    # fixed UUIDs, used both for their length and to resemble "real world" usage
+    _benchmark_extract_tokens(
+        benchmark,
+        "00000000-0000-0000-0000-000000000000,00000000-0000-0000-0000-000000000001,00000000-0000-0000-0000-000000000002",
+    )
+
+
+def test_benchmark_extract_tokens_short_nested(benchmark):
+    _benchmark_extract_tokens(benchmark, "abc(def,ghi),jkl(mno,pqr)")
+
+
+def test_benchmark_extract_tokens_long_nested(benchmark):
+    _benchmark_extract_tokens(
+        benchmark,
+        "00000000-0000-0000-0000-000000000000(00000000-0000-0000-0000-000000000001,00000000-0000-0000-0000-000000000002),00000000-0000-0000-0000-000000000003(00000000-0000-0000-0000-000000000004,00000000-0000-0000-0000-000000000005)",
+    )

From 6283d4fdcc6c001e5a6c493fce2c1e9df5165d41 Mon Sep 17 00:00:00 2001
From: Huon Wilson <huon@exoflare.io>
Date: Tue, 27 Aug 2024 09:44:11 +1000
Subject: [PATCH 3/6] [0003_finditer] re.finditer based solution

-------------------------------------------------------------------------------------------------------- benchmark: 4 tests -------------------------------------------------------------------------------------------------------
Name (time in ns)                                     Min                    Max                  Mean              StdDev                Median                IQR            Outliers  OPS (Kops/s)            Rounds  Iterations
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_benchmark_extract_tokens_short_simple       999.8912 (1.0)      18,417.0203 (1.0)      1,123.3050 (1.0)      207.9197 (1.0)      1,124.9213 (1.0)      42.0259 (1.0)        97;670      890.2302 (1.0)       26667           1
test_benchmark_extract_tokens_long_simple      1,415.9596 (1.42)     45,624.9109 (2.48)     1,562.9986 (1.39)     223.0803 (1.07)     1,542.0374 (1.37)     42.0259 (1.0)      620;6094      639.7958 (0.72)     196733           1
test_benchmark_extract_tokens_short_nested     1,749.9551 (1.75)     31,417.0029 (1.71)     1,940.4944 (1.73)     295.1896 (1.42)     1,917.0111 (1.70)     42.0259 (1.0)      348;4660      515.3326 (0.58)     115943           1
test_benchmark_extract_tokens_long_nested      2,583.9545 (2.58)     33,583.0264 (1.82)     2,777.7344 (2.47)     224.8759 (1.08)     2,750.0791 (2.44)     42.0259 (1.0)      374;5402      360.0056 (0.40)     117082           1
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---
 casbin/persist/adapter.py | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/casbin/persist/adapter.py b/casbin/persist/adapter.py
index f4c69bdc..fd3309f6 100644
--- a/casbin/persist/adapter.py
+++ b/casbin/persist/adapter.py
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import re
+
+_INTERESTING_TOKENS_RE = re.compile(r"[,\[\]\(\)]")
+
 
 def _extract_tokens(line):
     """Return the list of 'tokens' from the line, or None if this line has none"""
@@ -24,20 +28,29 @@ def _extract_tokens(line):
 
     stack = []
     tokens = []
-    for c in line:
+
+    # The tokens are separated by commas, but we support nesting so a naive `line.split(",")` is
+    # wrong. E.g. `abc(def, ghi), jkl` is two tokens: `abc(def, ghi)` and `jkl`. Instead, we
+    # iterate over the locations of the delimiter characters `,[]()` and, for each one, either:
+    #
+    # - [](): adjust the nesting depth
+    # - ,: slice the line to save the token, if the , is at the top-level, outside all []()
+    #
+    # `start_idx` is the index in `line` where the current (not yet comma-terminated) token starts.
+    start_idx = 0
+    for match in _INTERESTING_TOKENS_RE.finditer(line):
+        c = match.group()
         if c == "[" or c == "(":
             stack.append(c)
-            tokens[-1] += c
         elif c == "]" or c == ")":
             stack.pop()
-            tokens[-1] += c
         elif c == "," and len(stack) == 0:
-            tokens.append("")
-        else:
-            if len(tokens) == 0:
-                tokens.append(c)
-            else:
-                tokens[-1] += c
+            # we've found the end of a top level token so save that and start a new one
+            tokens.append(line[start_idx : match.start()])
+            start_idx = match.end()
+
+    # trailing token after the last ,
+    tokens.append(line[start_idx:])
 
     tokens = [x.strip() for x in tokens]
     return tokens

From 18b884b0e40fdd786a2c421b214e6b83d1a0b2d4 Mon Sep 17 00:00:00 2001
From: Huon Wilson <huon@exoflare.io>
Date: Tue, 27 Aug 2024 09:53:48 +1000
Subject: [PATCH 4/6] [0006_implicit_comma] `c == ","` is implied, omit it:

---------------------------------------------------------------------------------------------------------- benchmark: 4 tests ---------------------------------------------------------------------------------------------------------
Name (time in ns)                                     Min                     Max                  Mean                StdDev                Median                 IQR            Outliers  OPS (Kops/s)            Rounds  Iterations
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_benchmark_extract_tokens_short_simple       957.9817 (1.0)       20,832.9875 (1.0)      1,106.7959 (1.0)        167.6173 (1.0)      1,084.0595 (1.0)       42.0259 (1.0)       100;508      903.5089 (1.0)       29412           1
test_benchmark_extract_tokens_long_simple      1,374.9814 (1.44)      54,500.0657 (2.62)     1,526.7156 (1.38)       224.1555 (1.34)     1,500.0114 (1.38)      42.0259 (1.0)      511;7264      655.0008 (0.72)     187513           1
test_benchmark_extract_tokens_short_nested     1,707.9292 (1.78)     119,416.9745 (5.73)     1,897.0402 (1.71)       592.5052 (3.53)     1,874.9852 (1.73)      83.9354 (2.00)     458;6029      527.1370 (0.58)     166666           1
test_benchmark_extract_tokens_long_nested      2,500.0190 (2.61)     197,916.0588 (9.50)     2,773.7646 (2.51)     1,057.4522 (6.31)     2,707.9368 (2.50)     166.9396 (3.97)     509;2386      360.5209 (0.40)     116511           1
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---
 casbin/persist/adapter.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/casbin/persist/adapter.py b/casbin/persist/adapter.py
index fd3309f6..c56041d4 100644
--- a/casbin/persist/adapter.py
+++ b/casbin/persist/adapter.py
@@ -44,8 +44,9 @@ def _extract_tokens(line):
             stack.append(c)
         elif c == "]" or c == ")":
             stack.pop()
-        elif c == "," and len(stack) == 0:
-            # we've found the end of a top level token so save that and start a new one
+        elif len(stack) == 0:
+            # must be a comma outside of any nesting: we've found the end of a top level token so
+            # save that and start a new one
             tokens.append(line[start_idx : match.start()])
             start_idx = match.end()
 

From aa5d8386f977512ef21a37d072153e1f8f329d91 Mon Sep 17 00:00:00 2001
From: Huon Wilson <huon@exoflare.io>
Date: Tue, 27 Aug 2024 09:56:30 +1000
Subject: [PATCH 5/6] [0007_no_len] use `not stack` instead of `len(stack) ==
 0`

------------------------------------------------------------------------------------------------------------ benchmark: 4 tests ------------------------------------------------------------------------------------------------------------
Name (time in ns)                                     Min                        Max                  Mean                  StdDev                Median                 IQR            Outliers  OPS (Kops/s)            Rounds  Iterations
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_benchmark_extract_tokens_short_simple       917.0035 (1.0)          30,542.0253 (1.0)      1,116.1783 (1.0)          703.1331 (3.24)     1,083.0117 (1.0)       41.9095 (1.0)       228;817      895.9142 (1.0)       35346           1
test_benchmark_extract_tokens_long_simple      1,332.9554 (1.45)         46,208.0352 (1.51)     1,483.3709 (1.33)         216.9831 (1.0)      1,459.0332 (1.35)      42.0259 (1.00)     820;5565      674.1402 (0.75)     183217           1
test_benchmark_extract_tokens_short_nested     1,624.9251 (1.77)        185,417.0114 (6.07)     1,924.2939 (1.72)       1,860.9511 (8.58)     1,792.0975 (1.65)     167.0560 (3.99)     722;1138      519.6711 (0.58)      89214           1
test_benchmark_extract_tokens_long_nested      2,417.0149 (2.64)     16,447,208.9382 (538.51)   3,852.3560 (3.45)     100,246.2894 (462.00)   2,707.9368 (2.50)     208.0342 (4.96)       20;836      259.5814 (0.29)      51283           1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---
 casbin/persist/adapter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/casbin/persist/adapter.py b/casbin/persist/adapter.py
index c56041d4..733889b4 100644
--- a/casbin/persist/adapter.py
+++ b/casbin/persist/adapter.py
@@ -44,7 +44,7 @@ def _extract_tokens(line):
             stack.append(c)
         elif c == "]" or c == ")":
             stack.pop()
-        elif len(stack) == 0:
+        elif not stack:
             # must be a comma outside of any nesting: we've found the end of a top level token so
             # save that and start a new one
             tokens.append(line[start_idx : match.start()])

From 6d67769a599dd26e2677d9496a6e9e39d7083a44 Mon Sep 17 00:00:00 2001
From: Huon Wilson <huon@exoflare.io>
Date: Tue, 27 Aug 2024 09:57:27 +1000
Subject: [PATCH 6/6] [0008_inline_strip] Strip inline, don't do a second
 iteration

--------------------------------------------------------------------------------------------------------- benchmark: 4 tests ---------------------------------------------------------------------------------------------------------
Name (time in ns)                                     Min                     Max                  Mean                StdDev                Median                IQR            Outliers  OPS (Kops/s)            Rounds  Iterations
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_benchmark_extract_tokens_short_simple       790.9257 (1.0)       25,041.9835 (1.00)       965.2633 (1.0)        690.2278 (5.36)       917.0035 (1.0)      42.0259 (1.02)      261;825    1,035.9868 (1.0)       55814           1
test_benchmark_extract_tokens_long_simple      1,207.9254 (1.53)     568,790.9434 (22.75)    1,424.0680 (1.48)     1,607.7592 (12.49)    1,415.9596 (1.54)     83.0041 (2.02)     544;6540      702.2137 (0.68)     179118           1
test_benchmark_extract_tokens_short_nested     1,499.8950 (1.90)      24,999.9575 (1.0)      1,632.3808 (1.69)       136.4377 (1.06)     1,625.0415 (1.77)     41.0946 (1.0)     1290;4671      612.6022 (0.59)     146349           1
test_benchmark_extract_tokens_long_nested      2,291.0535 (2.90)      28,708.0184 (1.15)     2,454.3364 (2.54)       128.6998 (1.0)      2,457.9931 (2.68)     42.0259 (1.02)    1624;8542      407.4421 (0.39)     146349           1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---
 casbin/persist/adapter.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/casbin/persist/adapter.py b/casbin/persist/adapter.py
index 733889b4..b5c457d5 100644
--- a/casbin/persist/adapter.py
+++ b/casbin/persist/adapter.py
@@ -47,13 +47,12 @@ def _extract_tokens(line):
         elif not stack:
             # must be a comma outside of any nesting: we've found the end of a top level token so
             # save that and start a new one
-            tokens.append(line[start_idx : match.start()])
+            tokens.append(line[start_idx : match.start()].strip())
             start_idx = match.end()
 
     # trailing token after the last ,
-    tokens.append(line[start_idx:])
+    tokens.append(line[start_idx:].strip())
 
-    tokens = [x.strip() for x in tokens]
     return tokens