Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 537d73b

Browse files
authored
Merge pull request #823 from datafold/fix-numeric-precision-recognition-bq-pg
Fix precision recognition
2 parents a2c64ac + 8d1388a commit 537d73b

File tree

6 files changed

+104
-19
lines changed

6 files changed

+104
-19
lines changed

data_diff/databases/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ class BaseDialect(abc.ABC):
202202
SUPPORTS_INDEXES: ClassVar[bool] = False
203203
PREVENT_OVERFLOW_WHEN_CONCAT: ClassVar[bool] = False
204204
TYPE_CLASSES: ClassVar[Dict[str, Type[ColType]]] = {}
205+
DEFAULT_NUMERIC_PRECISION: ClassVar[int] = 0 # effective precision when type is just "NUMERIC"
205206

206207
PLACEHOLDER_TABLE = None # Used for Oracle
207208

data_diff/databases/bigquery.py

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ class Dialect(BaseDialect):
7676
}
7777
TYPE_ARRAY_RE = re.compile(r"ARRAY<(.+)>")
7878
TYPE_STRUCT_RE = re.compile(r"STRUCT<(.+)>")
79+
# [BIG]NUMERIC, [BIG]NUMERIC(precision, scale), [BIG]NUMERIC(precision)
80+
TYPE_NUMERIC_RE = re.compile(r"^((BIG)?NUMERIC)(?:\((\d+)(?:, (\d+))?\))?$")
81+
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#parameterized_decimal_type
82+
# The default scale is 9, which means a number can have up to 9 digits after the decimal point.
83+
DEFAULT_NUMERIC_PRECISION = 9
7984

8085
def random(self) -> str:
8186
return "RAND()"
@@ -94,21 +99,43 @@ def type_repr(self, t) -> str:
9499

95100
def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
96101
col_type = super().parse_type(table_path, info)
97-
if isinstance(col_type, UnknownColType):
98-
m = self.TYPE_ARRAY_RE.fullmatch(info.data_type)
99-
if m:
100-
item_info = attrs.evolve(info, data_type=m.group(1))
101-
item_type = self.parse_type(table_path, item_info)
102-
col_type = Array(item_type=item_type)
103-
104-
# We currently ignore structs' structure, but later can parse it too. Examples:
105-
# - STRUCT<INT64, STRING(10)> (unnamed)
106-
# - STRUCT<foo INT64, bar STRING(10)> (named)
107-
# - STRUCT<foo INT64, bar ARRAY<INT64>> (with complex fields)
108-
# - STRUCT<foo INT64, bar STRUCT<a INT64, b INT64>> (nested)
109-
m = self.TYPE_STRUCT_RE.fullmatch(info.data_type)
110-
if m:
111-
col_type = Struct()
102+
if not isinstance(col_type, UnknownColType):
103+
return col_type
104+
105+
m = self.TYPE_ARRAY_RE.fullmatch(info.data_type)
106+
if m:
107+
item_info = attrs.evolve(info, data_type=m.group(1))
108+
item_type = self.parse_type(table_path, item_info)
109+
col_type = Array(item_type=item_type)
110+
return col_type
111+
112+
# We currently ignore structs' structure, but later can parse it too. Examples:
113+
# - STRUCT<INT64, STRING(10)> (unnamed)
114+
# - STRUCT<foo INT64, bar STRING(10)> (named)
115+
# - STRUCT<foo INT64, bar ARRAY<INT64>> (with complex fields)
116+
# - STRUCT<foo INT64, bar STRUCT<a INT64, b INT64>> (nested)
117+
m = self.TYPE_STRUCT_RE.fullmatch(info.data_type)
118+
if m:
119+
col_type = Struct()
120+
return col_type
121+
122+
m = self.TYPE_NUMERIC_RE.fullmatch(info.data_type)
123+
if m:
124+
precision = int(m.group(3)) if m.group(3) else None
125+
scale = int(m.group(4)) if m.group(4) else None
126+
127+
if scale is not None:
128+
# NUMERIC(..., scale) — scale is set explicitly
129+
effective_precision = scale
130+
elif precision is not None:
131+
# NUMERIC(...) — scale is missing but precision is set
132+
# effectively the same as NUMERIC(..., 0)
133+
effective_precision = 0
134+
else:
135+
# NUMERIC → default scale is 9
136+
effective_precision = 9
137+
col_type = Decimal(precision=effective_precision)
138+
return col_type
112139

113140
return col_type
114141

data_diff/databases/duckdb.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ class Dialect(BaseDialect):
4646
SUPPORTS_PRIMARY_KEY = True
4747
SUPPORTS_INDEXES = True
4848

49+
# https://duckdb.org/docs/sql/data_types/numeric#fixed-point-decimals
50+
# If neither WIDTH nor SCALE is specified, the default is DECIMAL(18, 3), i.e. a scale of 3.
51+
DEFAULT_NUMERIC_PRECISION = 3
52+
4953
TYPE_CLASSES = {
5054
# Timestamps
5155
"TIMESTAMP WITH TIME ZONE": TimestampTZ,

data_diff/databases/postgresql.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ class PostgresqlDialect(BaseDialect):
4545
SUPPORTS_PRIMARY_KEY: ClassVar[bool] = True
4646
SUPPORTS_INDEXES = True
4747

48+
# https://www.postgresql.org/docs/current/datatype-numeric.html#DATATYPE-NUMERIC-DECIMAL
49+
# NUMERIC without any precision or scale creates an “unconstrained numeric” column
50+
# in which numeric values of any length can be stored, up to the implementation limits.
51+
# https://www.postgresql.org/docs/current/datatype-numeric.html#DATATYPE-NUMERIC-TABLE
52+
DEFAULT_NUMERIC_PRECISION = 16383
53+
4854
TYPE_CLASSES: ClassVar[Dict[str, Type[ColType]]] = {
4955
# Timestamps
5056
"timestamp with time zone": TimestampTZ,
@@ -185,10 +191,21 @@ def select_table_schema(self, path: DbPath) -> str:
185191
if database:
186192
info_schema_path.insert(0, database)
187193

188-
return (
189-
f"SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale FROM {'.'.join(info_schema_path)} "
190-
f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
191-
)
194+
return f"""SELECT column_name, data_type, datetime_precision,
195+
-- see comment for DEFAULT_NUMERIC_PRECISION
196+
CASE
197+
WHEN data_type = 'numeric'
198+
THEN coalesce(numeric_precision, 131072 + {self.dialect.DEFAULT_NUMERIC_PRECISION})
199+
ELSE numeric_precision
200+
END AS numeric_precision,
201+
CASE
202+
WHEN data_type = 'numeric'
203+
THEN coalesce(numeric_scale, {self.dialect.DEFAULT_NUMERIC_PRECISION})
204+
ELSE numeric_scale
205+
END AS numeric_scale
206+
FROM {'.'.join(info_schema_path)}
207+
WHERE table_name = '{table}' AND table_schema = '{schema}'
208+
"""
192209

193210
def select_table_unique_columns(self, path: DbPath) -> str:
194211
database, schema, table = self._normalize_table_path(path)

data_diff/databases/vertica.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ class Dialect(BaseDialect):
5757
"boolean": Boolean,
5858
}
5959

60+
# https://www.vertica.com/docs/9.3.x/HTML/Content/Authoring/SQLReferenceManual/DataTypes/Numeric/NUMERIC.htm#Default
61+
DEFAULT_NUMERIC_PRECISION = 15
62+
6063
def quote(self, s: str):
6164
return f'"{s}"'
6265

tests/test_database.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,3 +134,36 @@ def test_three_part_support(self):
134134
d = db.query_table_schema(part.path)
135135
assert len(d) == 1
136136
db.query(part.drop())
137+
138+
139+
@test_each_database
140+
class TestNumericPrecisionParsing(unittest.TestCase):
141+
def test_specified_precision(self):
142+
name = "tbl_" + random_table_suffix()
143+
db = get_conn(self.db_cls)
144+
tbl = table(name, schema={"value": "DECIMAL(10, 2)"})
145+
db.query(tbl.create())
146+
t = table(name)
147+
raw_schema = db.query_table_schema(t.path)
148+
schema = db._process_table_schema(t.path, raw_schema)
149+
self.assertEqual(schema["value"].precision, 2)
150+
151+
def test_specified_zero_precision(self):
152+
name = "tbl_" + random_table_suffix()
153+
db = get_conn(self.db_cls)
154+
tbl = table(name, schema={"value": "DECIMAL(10)"})
155+
db.query(tbl.create())
156+
t = table(name)
157+
raw_schema = db.query_table_schema(t.path)
158+
schema = db._process_table_schema(t.path, raw_schema)
159+
self.assertEqual(schema["value"].precision, 0)
160+
161+
def test_default_precision(self):
162+
name = "tbl_" + random_table_suffix()
163+
db = get_conn(self.db_cls)
164+
tbl = table(name, schema={"value": "DECIMAL"})
165+
db.query(tbl.create())
166+
t = table(name)
167+
raw_schema = db.query_table_schema(t.path)
168+
schema = db._process_table_schema(t.path, raw_schema)
169+
self.assertEqual(schema["value"].precision, db.dialect.DEFAULT_NUMERIC_PRECISION)

0 commit comments

Comments
 (0)