Merge branch 'master' into redacted_log

erezsh · web-flow · commit 63948c526843 · 2022-07-25T15:47:10.000+02:00
diff --git a/README.md b/README.md
@@ -143,6 +143,10 @@ $ data-diff \
 If a database is not on the list, we'd still love to support it. Open an issue
 to discuss it.
 
+Note: URLs may contain special characters that collide with the syntax of your command-line shell,
+so it's recommended to surround them with quotes. Alternatively, you may provide them in a TOML file via the `--config` option.
+
+
 # How to install
 
 Requires Python 3.7+ with pip.
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -136,10 +136,8 @@ def _main(
         return
 
     key_column = key_column or "id"
-    if bisection_factor is None:
-        bisection_factor = DEFAULT_BISECTION_FACTOR
-    if bisection_threshold is None:
-        bisection_threshold = DEFAULT_BISECTION_THRESHOLD
+    bisection_factor = DEFAULT_BISECTION_FACTOR if bisection_factor is None else int(bisection_factor)
+    bisection_threshold = DEFAULT_BISECTION_THRESHOLD if bisection_threshold is None else int(bisection_threshold)
 
     threaded = True
     if threads is None:
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -205,8 +205,7 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType]):
         fields = [self.normalize_uuid(c, String_UUID()) for c in text_columns]
         samples_by_row = self.query(Select(fields, TableName(table_path), limit=16), list)
         if not samples_by_row:
-            logger.warning(f"Table {table_path} is empty.")
-            return
+            raise ValueError(f"Table {table_path} is empty.")
 
         samples_by_col = list(zip(*samples_by_row))
 
diff --git a/data_diff/databases/oracle.py b/data_diff/databases/oracle.py
@@ -27,7 +27,7 @@ class Oracle(ThreadedDatabase):
     ROUNDS_ON_PREC_LOSS = True
 
     def __init__(self, *, host, database, thread_count, **kw):
-        self.kwargs = dict(dsn="%s/%s" % (host, database), **kw)
+        self.kwargs = dict(dsn="%s/%s" % (host, database) if database else host, **kw)
 
         self.default_schema = kw.get("user")
 
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -8,7 +8,7 @@
 from collections import defaultdict
 from typing import List, Tuple, Iterator, Optional
 import logging
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from runtype import dataclass
 
@@ -315,17 +315,16 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
             ('-', columns) for items in table2 but not in table1
             Where `columns` is a tuple of values for the involved columns, i.e. (id, ...extra)
         """
+        # Validate options
         if self.bisection_factor >= self.bisection_threshold:
             raise ValueError("Incorrect param values (bisection factor must be lower than threshold)")
         if self.bisection_factor < 2:
             raise ValueError("Must have at least two segments per iteration (i.e. bisection_factor >= 2)")
 
+        # Query and validate schema
         table1, table2 = self._threaded_call("with_schema", [table1, table2])
         self._validate_and_adjust_columns(table1, table2)
 
-        key_ranges = self._threaded_call("query_key_range", [table1, table2])
-        mins, maxs = zip(*key_ranges)
-
         key_type = table1._schema[table1.key_column]
         key_type2 = table2._schema[table2.key_column]
         if not isinstance(key_type, IKey):
@@ -334,23 +333,42 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
             raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
         assert key_type.python_type is key_type2.python_type
 
-        # We add 1 because our ranges are exclusive of the end (like in Python)
-        try:
-            min_key = min(map(key_type.python_type, mins))
-            max_key = max(map(key_type.python_type, maxs)) + 1
-        except (TypeError, ValueError) as e:
-            raise type(e)(f"Cannot apply {key_type} to {mins}, {maxs}.") from e
+        # Query min/max values
+        key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
 
-        table1 = table1.new(min_key=min_key, max_key=max_key)
-        table2 = table2.new(min_key=min_key, max_key=max_key)
+        # Start with the first completed value, so we don't waste time waiting
+        min_key1, max_key1 = self._parse_key_range_result(key_type, next(key_ranges))
+
+        table1, table2 = [t.new(min_key=min_key1, max_key=max_key1) for t in (table1, table2)]
 
         logger.info(
             f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
             f"key-range: {table1.min_key}..{table2.max_key}, "
             f"size: {table2.max_key-table1.min_key}"
         )
 
-        return self._bisect_and_diff_tables(table1, table2)
+        # Bisect (split) the table into segments, and diff them recursively.
+        yield from self._bisect_and_diff_tables(table1, table2)
+
+        # Now we check for the second min-max, to diff the portions we "missed".
+        min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
+
+        if min_key2 < min_key1:
+            pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
+            yield from self._bisect_and_diff_tables(*pre_tables)
+
+        if max_key2 > max_key1:
+            post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
+            yield from self._bisect_and_diff_tables(*post_tables)
+
+    def _parse_key_range_result(self, key_type, key_range):
+        "Converts a (min, max) key pair to the key's python type, returning it as (min, max + 1)."
+        low, high = key_range
+        to_key = key_type.python_type
+        try:
+            return to_key(low), to_key(high) + 1  # +1 because our ranges are exclusive of the end (like in Python)
+        except (TypeError, ValueError) as err:
+            raise type(err)(f"Cannot apply {key_type} to {low}, {high}.") from err
 
     def _validate_and_adjust_columns(self, table1, table2):
         for c in table1._relevant_columns:
@@ -474,12 +492,26 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
         if checksum1 != checksum2:
             yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))
 
-    def _thread_map(self, func, iter):
+    def _thread_map(self, func, iterable):
+        "Maps func over iterable, through a thread pool when self.threaded is set. Results keep input order."
+        if self.threaded:
+            with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as pool:
+                return pool.map(func, iterable)
+        return map(func, iterable)
+
+    def _threaded_call(self, func, iterable):
+        "Calls the method named by `func` on each object in iterable, and returns the results as a list."
+        return [*self._thread_map(methodcaller(func), iterable)]
+
+    def _thread_as_completed(self, func, iterable):  # Generator: yields func(item) for each item in iterable
         if not self.threaded:
-            return map(func, iter)
+            yield from map(func, iterable)  # Must yield: a bare `return map(...)` in a generator produces no items
+            return
 
-        task_pool = ThreadPoolExecutor(max_workers=self.max_threadpool_size)
-        return task_pool.map(func, iter)
+        with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool:
+            futures = [task_pool.submit(func, item) for item in iterable]
+            yield from (future.result() for future in as_completed(futures))  # In order of completion
 
-    def _threaded_call(self, func, iter):
-        return list(self._thread_map(methodcaller(func), iter))
+    def _threaded_call_as_completed(self, func, iterable):
+        "Calls the method named by `func` on each object in iterable. Results come from `_thread_as_completed`."
+        return self._thread_as_completed(methodcaller(func), iterable)
diff --git a/data_diff/sql.py b/data_diff/sql.py
@@ -6,6 +6,8 @@
 
 from runtype import dataclass
 
+from .utils import join_iter
+
 from .databases.database_types import AbstractDatabase, DbPath, DbKey, DbTime, ArithUUID
 
 
@@ -15,6 +17,8 @@ class Sql:
 
 SqlOrStr = Union[Sql, str]
 
+CONCAT_SEP = "|"
+
 
 @dataclass
 class Compiler:
@@ -122,7 +126,8 @@ class Checksum(Sql):
     def compile(self, c: Compiler):
         if len(self.exprs) > 1:
             compiled_exprs = [f"coalesce({c.compile(expr)}, '<null>')" for expr in self.exprs]
-            expr = c.database.concat(compiled_exprs)
+            separated = list(join_iter(f"'|'", compiled_exprs))
+            expr = c.database.concat(separated)
         else:
             # No need to coalesce - safe to assume that key cannot be null
             (expr,) = self.exprs
diff --git a/data_diff/utils.py b/data_diff/utils.py
@@ -69,3 +69,11 @@ def remove_password_from_url(url: str, replace_with: str="***") -> str:
     netloc = _join_if_any("@", filter(None, [account, host]))
     replaced = parsed._replace(netloc=netloc)
     return replaced.geturl()
+
+def join_iter(joiner: Any, iterable: iter) -> iter:
+    """Yield the items of `iterable` with `joiner` between each adjacent pair; empty input yields nothing."""
+    for index, item in enumerate(iterable):
+        if index:  # Every item but the first is preceded by the joiner
+            yield joiner
+        yield item
+
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -0,0 +1,83 @@
+import logging
+import unittest
+import preql
+import arrow
+import subprocess
+import sys
+
+from data_diff import diff_tables, connect_to_table
+
+from .common import TEST_MYSQL_CONN_STRING
+
+
+def run_datadiff_cli(*args):
+    "Runs `python -m data_diff` with the given arguments, and returns its stdout split into lines."
+    try:
+        return subprocess.check_output([sys.executable, "-m", "data_diff", *args], stderr=subprocess.PIPE).splitlines()
+    except subprocess.CalledProcessError as err:
+        logging.error(err.stderr)
+        raise
+
+
+class TestCLI(unittest.TestCase):  # Runs the data_diff CLI as a subprocess against the tables built in setUp
+    def setUp(self) -> None:
+        self.preql = preql.Preql(TEST_MYSQL_CONN_STRING)
+        self.preql(
+            r"""
+            table test_cli {
+                datetime: datetime
+                comment: string
+            }
+            commit()
+
+            func add(date, comment) {
+                new test_cli(date, comment)
+            }
+        """
+        )
+        self.now = now = arrow.get(self.preql.now())
+        self.preql.add(now, "now")
+        # NOTE(review): add() is declared as (date, comment), yet a timestamp is passed as the comment - confirm intent
+        self.preql.add(now, self.now.shift(seconds=-10))
+        self.preql.add(now, self.now.shift(seconds=-7))
+        self.preql.add(now, self.now.shift(seconds=-6))
+
+        self.preql(  # Creates test_cli_2 from test_cli, before the extra row below is inserted
+            r"""
+            const table test_cli_2 = test_cli
+            commit()
+        """
+        )
+
+        self.preql.add(self.now.shift(seconds=-3), "3 seconds ago")  # Inserted after test_cli_2 was created
+        self.preql.commit()
+
+    def tearDown(self) -> None:
+        self.preql.run_statement("drop table if exists test_cli")
+        self.preql.run_statement("drop table if exists test_cli_2")
+        self.preql.commit()
+        self.preql.close()
+
+        return super().tearDown()
+
+    def test_basic(self):
+        diff = run_datadiff_cli(TEST_MYSQL_CONN_STRING, "test_cli", TEST_MYSQL_CONN_STRING, "test_cli_2")
+        assert len(diff) == 1  # `diff` is the CLI's stdout lines; exactly one line is expected
+
+    def test_options(self):
+        diff = run_datadiff_cli(
+            TEST_MYSQL_CONN_STRING,
+            "test_cli",
+            TEST_MYSQL_CONN_STRING,
+            "test_cli_2",
+            "--bisection-factor",
+            "16",
+            "--bisection-threshold",
+            "10000",
+            "--limit",
+            "5",
+            "-t",
+            "datetime",
+            "--max-age",
+            "1h",
+        )
+        assert len(diff) == 1  # Same single line of output expected with the extra options
diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py
@@ -254,7 +254,7 @@ def test_get_values(self):
         table = self.table.with_schema()
 
         self.assertEqual(1, table.count())
-        concatted = str(id_) + time
+        concatted = str(id_) + "|" + time
         self.assertEqual(str_to_checksum(concatted), table.count_and_checksum()[1])
 
     def test_diff_small_tables(self):
@@ -405,7 +405,7 @@ def test_string_keys(self):
             f"INSERT INTO {self.table_src} VALUES ('unexpected', '<-- this bad value should not break us')", None
         )
 
-        self.assertRaises(ValueError, differ.diff_tables, self.a, self.b)
+        self.assertRaises(ValueError, list, differ.diff_tables(self.a, self.b))
 
 
 @test_per_database
@@ -592,7 +592,7 @@ def setUp(self):
 
     def test_right_table_empty(self):
         differ = TableDiffer()
-        self.assertRaises(ValueError, differ.diff_tables, self.a, self.b)
+        self.assertRaises(ValueError, list, differ.diff_tables(self.a, self.b))
 
     def test_left_table_empty(self):
         queries = [
@@ -605,4 +605,4 @@ def test_left_table_empty(self):
         _commit(self.connection)
 
         differ = TableDiffer()
-        self.assertRaises(ValueError, differ.diff_tables, self.a, self.b)
+        self.assertRaises(ValueError, list, differ.diff_tables(self.a, self.b))