Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit ada9306

Browse files
authored
Merge branch 'master' into unreachable_branch_validate_adjust
2 parents e61815b + 0b74046 commit ada9306

18 files changed: +1189 lines added, -1004 lines removed

.github/workflows/ci.yml

+1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ jobs:
6868
DATADIFF_CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse'
6969
DATADIFF_VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica'
7070
DATADIFF_REDSHIFT_URI: '${{ secrets.DATADIFF_REDSHIFT_URI }}'
71+
MOTHERDUCK_TOKEN: '${{ secrets.MOTHERDUCK_TOKEN }}'
7172
run: |
7273
chmod +x tests/waiting_for_stack_up.sh
7374
./tests/waiting_for_stack_up.sh && TEST_ACROSS_ALL_DBS=0 poetry run unittest-parallel -j 16

.github/workflows/ci_full.yml

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ jobs:
6464
DATADIFF_VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica'
6565
# DATADIFF_BIGQUERY_URI: '${{ secrets.DATADIFF_BIGQUERY_URI }}'
6666
DATADIFF_REDSHIFT_URI: '${{ secrets.DATADIFF_REDSHIFT_URI }}'
67+
MOTHERDUCK_TOKEN: '${{ secrets.MOTHERDUCK_TOKEN }}'
6768
run: |
6869
chmod +x tests/waiting_for_stack_up.sh
6970
./tests/waiting_for_stack_up.sh && poetry run unittest-parallel -j 16

data_diff/databases/_connect.py

+5-9
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from data_diff.databases.mssql import MsSQL
2727

2828

29-
@attrs.define(frozen=True)
29+
@attrs.frozen
3030
class MatchUriPath:
3131
database_cls: Type[Database]
3232

@@ -98,13 +98,11 @@ class Connect:
9898
"""Provides methods for connecting to a supported database using a URL or connection dict."""
9999

100100
database_by_scheme: Dict[str, Database]
101-
match_uri_path: Dict[str, MatchUriPath]
102101
conn_cache: MutableMapping[Hashable, Database]
103102

104103
def __init__(self, database_by_scheme: Dict[str, Database] = DATABASE_BY_SCHEME):
105104
super().__init__()
106105
self.database_by_scheme = database_by_scheme
107-
self.match_uri_path = {name: MatchUriPath(cls) for name, cls in database_by_scheme.items()}
108106
self.conn_cache = weakref.WeakValueDictionary()
109107

110108
def for_databases(self, *dbs) -> Self:
@@ -157,12 +155,10 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
157155
return self.connect_with_dict(conn_dict, thread_count, **kwargs)
158156

159157
try:
160-
matcher = self.match_uri_path[scheme]
158+
cls = self.database_by_scheme[scheme]
161159
except KeyError:
162160
raise NotImplementedError(f"Scheme '{scheme}' currently not supported")
163161

164-
cls = matcher.database_cls
165-
166162
if scheme == "databricks":
167163
assert not dsn.user
168164
kw = {}
@@ -175,6 +171,7 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
175171
kw["filepath"] = dsn.dbname
176172
kw["dbname"] = dsn.user
177173
else:
174+
matcher = MatchUriPath(cls)
178175
kw = matcher.match_path(dsn)
179176

180177
if scheme == "bigquery":
@@ -198,7 +195,7 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
198195

199196
kw = {k: v for k, v in kw.items() if v is not None}
200197

201-
if issubclass(cls, ThreadedDatabase):
198+
if isinstance(cls, type) and issubclass(cls, ThreadedDatabase):
202199
db = cls(thread_count=thread_count, **kw, **kwargs)
203200
else:
204201
db = cls(**kw, **kwargs)
@@ -209,11 +206,10 @@ def connect_with_dict(self, d, thread_count, **kwargs):
209206
d = dict(d)
210207
driver = d.pop("driver")
211208
try:
212-
matcher = self.match_uri_path[driver]
209+
cls = self.database_by_scheme[driver]
213210
except KeyError:
214211
raise NotImplementedError(f"Driver '{driver}' currently not supported")
215212

216-
cls = matcher.database_cls
217213
if issubclass(cls, ThreadedDatabase):
218214
db = cls(thread_count=thread_count, **d, **kwargs)
219215
else:

data_diff/databases/base.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -1093,11 +1093,7 @@ def _refine_coltypes(
10931093
list,
10941094
log_message=table_path,
10951095
)
1096-
if not samples_by_row:
1097-
raise ValueError(f"Table {table_path} is empty.")
1098-
1099-
samples_by_col = list(zip(*samples_by_row))
1100-
1096+
samples_by_col = list(zip(*samples_by_row)) if samples_by_row else [[]] * len(text_columns)
11011097
for col_name, samples in safezip(text_columns, samples_by_col):
11021098
uuid_samples = [s for s in samples if s and is_uuid(s)]
11031099

data_diff/databases/duckdb.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from typing import Any, ClassVar, Dict, Union, Type
22

33
import attrs
4+
from packaging.version import parse as parse_version
45

56
from data_diff.utils import match_regexps
67
from data_diff.abcs.database_types import (
@@ -27,6 +28,7 @@
2728
CHECKSUM_OFFSET,
2829
)
2930
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS
31+
from data_diff.version import __version__
3032

3133

3234
@import_helper("duckdb")
@@ -148,9 +150,21 @@ def close(self):
148150
def create_connection(self):
149151
ddb = import_duckdb()
150152
try:
151-
return ddb.connect(self._args["filepath"])
153+
# custom_user_agent is only available in duckdb >= 0.9.2
154+
if parse_version(ddb.__version__) >= parse_version("0.9.2"):
155+
custom_user_agent = f"data-diff/v{__version__}"
156+
config = {"custom_user_agent": custom_user_agent}
157+
connection = ddb.connect(database=self._args["filepath"], config=config)
158+
custom_user_agent_results = connection.sql("PRAGMA USER_AGENT;").fetchall()
159+
custom_user_agent_filtered = custom_user_agent_results[0][0]
160+
assert custom_user_agent in custom_user_agent_filtered
161+
else:
162+
connection = ddb.connect(database=self._args["filepath"])
163+
return connection
152164
except ddb.OperationalError as e:
153165
raise ConnectError(*e.args) from e
166+
except AssertionError:
167+
raise ConnectError("Assertion failed: Custom user agent is invalid.") from None
154168

155169
def select_table_schema(self, path: DbPath) -> str:
156170
database, schema, table = self._normalize_table_path(path)

data_diff/databases/mssql.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -119,13 +119,15 @@ def limit_select(
119119
) -> str:
120120
if offset:
121121
raise NotImplementedError("No support for OFFSET in query")
122-
123122
result = ""
124123
if not has_order_by:
125124
result += "ORDER BY 1"
126125

127126
result += f" OFFSET 0 ROWS FETCH NEXT {limit} ROWS ONLY"
128-
return f"SELECT * FROM ({select_query}) AS LIMITED_SELECT {result}"
127+
128+
# mssql requires that subquery columns are all aliased, so
129+
# don't wrap in an outer select
130+
return f"{select_query} {result}"
129131

130132
def constant_values(self, rows) -> str:
131133
values = ", ".join("(%s)" % ", ".join(self._constant_value(v) for v in row) for row in rows)

data_diff/databases/redshift.py

+36-1
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,38 @@ def query_pg_get_cols(self, path: DbPath) -> Dict[str, tuple]:
122122

123123
return schema_dict
124124

125+
def select_svv_columns_schema(self, path: DbPath) -> Dict[str, tuple]:
126+
database, schema, table = self._normalize_table_path(path)
127+
128+
db_clause = ""
129+
if database:
130+
db_clause = f" AND table_catalog = '{database.lower()}'"
131+
132+
return (
133+
f"""
134+
select
135+
distinct
136+
column_name,
137+
data_type,
138+
datetime_precision,
139+
numeric_precision,
140+
numeric_scale
141+
from
142+
svv_columns
143+
where table_name = '{table.lower()}' and table_schema = '{schema.lower()}'
144+
"""
145+
+ db_clause
146+
)
147+
148+
def query_svv_columns(self, path: DbPath) -> Dict[str, tuple]:
149+
rows = self.query(self.select_svv_columns_schema(path), list)
150+
if not rows:
151+
raise RuntimeError(f"{self.name}: Table '{'.'.join(path)}' does not exist, or has no columns")
152+
153+
d = {r[0]: r for r in rows}
154+
assert len(d) == len(rows)
155+
return d
156+
125157
# when using a non-information_schema source, strip (N) from type(N) etc. to match
126158
# typical information_schema output
127159
def _normalize_schema_info(self, rows) -> Dict[str, tuple]:
@@ -150,7 +182,10 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
150182
try:
151183
return self.query_external_table_schema(path)
152184
except RuntimeError:
153-
return self.query_pg_get_cols(path)
185+
try:
186+
return self.query_pg_get_cols(path)
187+
except Exception:
188+
return self.query_svv_columns(path)
154189

155190
def _normalize_table_path(self, path: DbPath) -> DbPath:
156191
if len(path) == 1:

data_diff/databases/snowflake.py

+25-15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import Any, ClassVar, Union, List, Type
1+
import base64
2+
from typing import Any, ClassVar, Union, List, Type, Optional
23
import logging
34

45
import attrs
@@ -103,7 +104,7 @@ class Snowflake(Database):
103104

104105
_conn: Any
105106

106-
def __init__(self, *, schema: str, **kw):
107+
def __init__(self, *, schema: str, key: Optional[str] = None, key_content: Optional[str] = None, **kw):
107108
super().__init__()
108109
snowflake, serialization, default_backend = import_snowflake()
109110
logging.getLogger("snowflake.connector").setLevel(logging.WARNING)
@@ -113,20 +114,29 @@ def __init__(self, *, schema: str, **kw):
113114
logging.getLogger("snowflake.connector.network").disabled = True
114115

115116
assert '"' not in schema, "Schema name should not contain quotes!"
117+
if key_content and key:
118+
raise ConnectError("Only key value or key file path can be specified, not both")
119+
120+
key_bytes = None
121+
if key:
122+
with open(key, "rb") as f:
123+
key_bytes = f.read()
124+
if key_content:
125+
key_bytes = base64.b64decode(key_content)
126+
116127
# If a private key is used, read it from the specified path and pass it as "private_key" to the connector.
117-
if "key" in kw:
118-
with open(kw.get("key"), "rb") as key:
119-
if "password" in kw:
120-
raise ConnectError("Cannot use password and key at the same time")
121-
if kw.get("private_key_passphrase"):
122-
encoded_passphrase = kw.get("private_key_passphrase").encode()
123-
else:
124-
encoded_passphrase = None
125-
p_key = serialization.load_pem_private_key(
126-
key.read(),
127-
password=encoded_passphrase,
128-
backend=default_backend(),
129-
)
128+
if key_bytes:
129+
if "password" in kw:
130+
raise ConnectError("Cannot use password and key at the same time")
131+
if kw.get("private_key_passphrase"):
132+
encoded_passphrase = kw.get("private_key_passphrase").encode()
133+
else:
134+
encoded_passphrase = None
135+
p_key = serialization.load_pem_private_key(
136+
key_bytes,
137+
password=encoded_passphrase,
138+
backend=default_backend(),
139+
)
130140

131141
kw["private_key"] = p_key.private_bytes(
132142
encoding=serialization.Encoding.DER,

data_diff/hashdiff_tables.py

-8
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,6 @@ def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegmen
118118
if lowest.precision != col2.precision:
119119
table2._schema[c2] = attrs.evolve(col2, precision=lowest.precision)
120120

121-
elif isinstance(col1, ColType_UUID):
122-
if strict and not isinstance(col2, ColType_UUID):
123-
raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
124-
125-
elif isinstance(col1, StringType):
126-
if strict and not isinstance(col2, StringType):
127-
raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
128-
129121
for t in [table1, table2]:
130122
for c in t.relevant_columns:
131123
ctype = t._schema[c]

data_diff/joindiff_tables.py

-1
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,6 @@ def _count_diff_per_column(
343343
table1: Optional[TableSegment] = None,
344344
table2: Optional[TableSegment] = None,
345345
):
346-
logger.info(type(table1))
347346
logger.debug(f"Counting differences per column: {table1.table_path} <> {table2.table_path}")
348347
is_diff_cols_counts = db.query(
349348
diff_rows.select(sum_(this[c]) for c in is_diff_cols),

data_diff/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.9.17"
1+
__version__ = "0.10.0rc0"

0 commit comments

Comments (0)