Skip to content

Commit 20413ba

Browse files
committed
Support common table expressions (CTEs)
- Bump minor version to `0.1.3` - Use [`sqlglot`](https://github.com/tobymao/sqlglot) for parsing
1 parent 89f2f82 commit 20413ba

File tree

5 files changed

+86
-42
lines changed

5 files changed

+86
-42
lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "sinker"
3-
version = "0.1.2"
3+
version = "0.1.3"
44
description = "Synchronize Postgres to Elasticsearch"
55
authors = ["Loren Siebert <[email protected]>"]
66
license = "MIT/Apache-2.0"
@@ -15,6 +15,7 @@ elasticsearch = "^8.17.0"
1515
environs = ">=9.5,<15.0"
1616
psycopg = "^3.1.8"
1717
pytest-mock = "^3.10.0"
18+
sqlglot = "^26.2.1"
1819

1920
[tool.poetry.group.dev.dependencies]
2021
flake8 = ">=6,<8"

src/sinker/sinker.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
PGCHUNK_SIZE,
1818
SCHEMA_TABLE_DELIMITER,
1919
)
20-
from .utils import generate_schema_tables
20+
from .utils import parse_schema_tables
2121

2222
logger = logging.getLogger(__name__)
2323

@@ -107,23 +107,23 @@ def setup_pg(self) -> None:
107107
plpgsql: str = f"{schema_view_name}_fn"
108108
create_function: str = q.CREATE_FUNCTION.format(plpgsql, SINKER_SCHEMA, SINKER_TODO_TABLE, schema_view_name)
109109
ddl_list.append(create_function)
110-
for schema_table in generate_schema_tables(view_select_query):
110+
# The last table is the top-level table that gets DELETE events with an ID in the replication slot.
111+
# The materialized views do not contain the ID of the doc being deleted,
112+
# so we'll use this table's delete events as a proxy.
113+
# lsn,xid,data
114+
# 0/24EDA4D8,17393,BEGIN 17393
115+
# 0/24EDA4D8,17393,"table public.""Foo"": DELETE: id[text]:'91754ea9-2983-4cf7-bdf9-fc23d2386d90'"
116+
# 0/24EDC1B0,17393,COMMIT 17393
117+
# 0/24EDC228,17394,BEGIN 17394
118+
# 0/24EF0D60,17394,table sinker.foo_mv: DELETE: (no-tuple-data)
119+
# 0/24EF4718,17394,COMMIT 17394
120+
self.parent_table, schema_tables = parse_schema_tables(view_select_query)
121+
for schema_table in schema_tables:
111122
schema, _, table = schema_table.rpartition(SCHEMA_TABLE_DELIMITER)
112123
schema = schema or DEFAULT_SCHEMA
113124
trigger_name: str = f"{SINKER_SCHEMA}_{self.view}_{schema}_{table}"
114125
create_trigger: str = q.CREATE_TRIGGER.format(trigger_name, schema, table, plpgsql)
115126
ddl_list.append(create_trigger)
116-
# The last table is the top-level table that gets DELETE events with an ID in the replication slot.
117-
# The materialized views do not contain the ID of the doc being deleted,
118-
# so we'll use this table's delete events as a proxy.
119-
# lsn,xid,data
120-
# 0/24EDA4D8,17393,BEGIN 17393
121-
# 0/24EDA4D8,17393,"table public.""Foo"": DELETE: id[text]:'91754ea9-2983-4cf7-bdf9-fc23d2386d90'"
122-
# 0/24EDC1B0,17393,COMMIT 17393
123-
# 0/24EDC228,17394,BEGIN 17394
124-
# 0/24EF0D60,17394,table sinker.foo_mv: DELETE: (no-tuple-data)
125-
# 0/24EF4718,17394,COMMIT 17394
126-
self.parent_table = schema_table
127127
create_todo_entry: str = q.CREATE_TODO_ENTRY.format(SINKER_SCHEMA, SINKER_TODO_TABLE, schema_view_name)
128128
ddl_list.append(create_todo_entry)
129129
psycopg.connect(autocommit=True).execute("; ".join(ddl_list))

src/sinker/utils.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,18 @@
1-
import re
1+
from typing import Set, Tuple
22

3-
from typing import Iterable
3+
import sqlglot
4+
from sqlglot.expressions import Table, CTE
45

5-
TABLE_RE = re.compile(r"from\s\"?(\S+)\b", re.I)
66

7-
8-
def generate_schema_tables(view_select_query: str) -> Iterable[str]:
7+
def parse_schema_tables(view_select_query: str) -> Tuple[str, Set[str]]:
98
"""
10-
Given a view select query, return a list of unique tables that are referenced in the query
11-
in the order they were encountered.
9+
Given a view select query, return a primary parent table and the set of unique tables that are referenced in the query.
1210
Skip anything that looks like a function call.
1311
:param view_select_query: The select query from the view
1412
"""
15-
seen: set = set()
16-
for table_candidate in TABLE_RE.findall(view_select_query):
17-
if "(" not in table_candidate:
18-
if table_candidate not in seen:
19-
seen.add(table_candidate)
20-
yield table_candidate
13+
parsed = sqlglot.parse_one(view_select_query)
14+
parent_table = parsed.find(Table).name
15+
tables = {table.name for table in parsed.find_all(Table)}
16+
ctes = {cte.alias for cte in parsed.find_all(CTE)}
17+
schema_tables = tables - ctes
18+
return parent_table, schema_tables

tests/test_generate_schema_tables.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

tests/test_parse_schema_tables.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from sinker.utils import parse_schema_tables
2+
3+
4+
def test_parse_schema_tables():
5+
view_select_query = """select id,
6+
json_build_object(
7+
'name', "name",
8+
'otherEmailDomains',(select array_agg(split_part(email, '@', 2)) FROM unnest(emails) as email),
9+
'emailDomains', (select array_agg(split_part(value, '@', 2))
10+
from "EmailAddress" EA where "personId"="Person".id),
11+
'emailAddresses', (select array_agg(value) from "EmailAddress" EA where "personId"="Person".id),
12+
) as "person"
13+
from "person"
14+
"""
15+
parent_table, schema_tables = parse_schema_tables(view_select_query)
16+
assert parent_table == "person"
17+
assert schema_tables == {"EmailAddress", "person"}
18+
19+
def test_parse_schema_tables_with_cte():
20+
view_select_query = """
21+
WITH
22+
attendees AS (
23+
SELECT DISTINCT ON (a."personId", a."hostedEventId")
24+
a."hostedEventId",
25+
a.status,
26+
e.value as email,
27+
p."primaryOrganizationId"
28+
FROM "HostedEventAttendance" a
29+
JOIN "Person" p ON a."personId" = p.id
30+
JOIN "EmailAddress" e ON p.id = e."personId"
31+
GROUP BY
32+
a."personId",
33+
a."hostedEventId",
34+
a.status,
35+
e.value,
36+
p."primaryOrganizationId"
37+
)
38+
SELECT
39+
id,
40+
json_build_object(
41+
'summary', "name",
42+
'startTime', "timestamp",
43+
'attendees', (
44+
SELECT json_agg(json_build_object('email', attendees.email, 'eventResponse', attendees.status)) AS formatted_attendees
45+
FROM attendees
46+
WHERE attendees."hostedEventId" = "HostedEvent".id
47+
),
48+
'organizationIds',
49+
(
50+
SELECT array_agg(attendees."primaryOrganizationId")
51+
FROM attendees
52+
WHERE attendees."hostedEventId" = "HostedEvent".id
53+
)
54+
) AS "hosted_events"
55+
FROM
56+
"HostedEvent"
57+
"""
58+
parent_table, schema_tables = parse_schema_tables(view_select_query)
59+
assert parent_table == "HostedEvent"
60+
assert schema_tables == {"EmailAddress", "HostedEvent", "HostedEventAttendance", "Person"}

0 commit comments

Comments
 (0)