
Commit d7bfdfc

feat: added speedup, as well as email and phone (#5)

1 parent 199e4a2 · commit d7bfdfc

9 files changed: +210 −55 lines changed

README.md (+20 −4)
@@ -7,13 +7,14 @@ Ever want to quickly create millions of rows of random data for a database, with
 ## Usage
 
 ```shell
-usage: create_entries.py [-h] [--extended-help] [-c] [-d] [--drop-table] [--force] [-f {csv,mysql,postgresql,sqlserver}] [--generate-dates] [-g] [-i INPUT] [-n NUM] [-o OUTPUT]
-                         [-r] [-t TABLE] [--validate VALIDATE]
+usage: create_entries.py [-h] [--extended-help] [-c] [--country {au,de,fr,ke,jp,mx,ua,uk,us}] [-d] [--drop-table] [--force] [-f {csv,mysql,postgresql,sqlserver}] [--generate-dates] [-g] [-i INPUT] [-n NUM] [-o OUTPUT] [-r] [-t TABLE] [--validate VALIDATE]
 
 options:
   -h, --help            show this help message and exit
   --extended-help       Print extended help
   -c, --chunk           Chunk SQL INSERT statements
+  --country {au,de,fr,ke,jp,mx,ua,uk,us}
+                        The country's phone number structure to use if generating phone numbers
   -d, --debug           Print tracebacks for errors
   --drop-table          WARNING: DESTRUCTIVE - use DROP TABLE with generation
   --force               WARNING: DESTRUCTIVE - overwrite any files
@@ -63,6 +64,9 @@ GenSQL expects a JSON input schema, of the format:
 * This uses a C library to perform random shuffles. There are no external libraries, so as long as you have a reasonably new compiler, `make` should work for you.
 * `--force` and `--drop-table` have warnings for a reason. If you run a query with `DROP TABLE IF EXISTS`, please be sure of what you're doing.
 * `--random` allows TEXT and JSON columns to vary in length, which may or may not matter to you. It causes a ~10% slowdown. If not selected, a deterministic 20% of the rows in these columns will be longer than the rest. If that also bothers you, change DEFAULT_VARYING_LENGTH to `False`.
+* `--generate-dates` takes practically the same amount of time as, or slightly longer than, generating datetimes on demand. It's useful if you want the same set of datetimes for a series of tables, although their ordering during row generation remains random.
+* A column named `phone` will generate realistic - to the best of my knowledge - phone numbers for a given country (currently a very limited set). This is not yet optimized for performance, and incurs a ~40% slowdown over the baseline. A solution in C may or may not speed things up: it isn't that performing `random.shuffle()` on a 10-digit number is slow, it's that doing so `n` times is a lot of function calls. Inlining C functions in Python [does exist](https://github.com/ssize-t/inlinec), but the non-caching of its compilation would probably negate any savings. A sketch of the approach is shown below.
+* Similarly, a column named `email` will generate realistic email addresses (all with a `.com` TLD), and incurs a ~40% slowdown over the baseline.
 
 ### Loading data
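The digit-shuffle approach the phone note describes is small enough to sketch. A minimal illustration follows; the `PHONE_NUMBER` country formatters here are hypothetical stand-ins, since the real mapping is imported from the project's constants and isn't shown in this diff:

```python
import random

# Hypothetical stand-ins for the PHONE_NUMBER formatters imported by
# create_entries.py; the real per-country formats live elsewhere in the repo.
PHONE_NUMBER = {
    "us": lambda d: f"({d[0:3]}) {d[3:6]}-{d[6:10]}",
    "uk": lambda d: f"0{d[0:4]} {d[4:10]}",
}

def make_phone(country: str) -> str:
    # shuffle the ten digits, then format them per country -
    # the same shape as the make_row branch added in this commit
    digits = [str(x) for x in range(10)]
    random.shuffle(digits)
    return PHONE_NUMBER[country]("".join(digits))

print(make_phone("us"))  # e.g. (382) 590-1467
```

The ~40% slowdown comes from doing this per row: each row costs a `shuffle`, a `join`, and a formatter call, which adds up at millions of rows.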
@@ -120,6 +124,18 @@ mysql -h 127.0.0.1 -usgarland -ppassword test -e 0.02s user 0.01s system 0% cp
 
 Or, in terms of ratios, using chunking is approximately 3x as fast as the baseline, while loading a CSV is approximately 4x as fast as the baseline.
 
+```
+# baseline
+❯ time mysql -h localhost -usgarland -ppassword test < test.sql
+mysql -h localhost -usgarland -ppassword test < test.sql 32.75s user 10.90s system 14% cpu 4:55.91 total
+# no unique checks
+❯ time mysql -h localhost -usgarland -ppassword test < test.sql
+mysql -h localhost -usgarland -ppassword test < test.sql 25.11s user 8.67s system 14% cpu 3:48.38 total
+# no unique checks, single insert, 1 GB max allowed packet size
+❯ time mysql -h localhost -usgarland -ppassword --max-allowed-packet=1073741824 test < test.sql
+mysql -h localhost -usgarland -ppassword --max-allowed-packet=1073741824 test 10.64s user 0.91s system 7% cpu 2:28.29 total
+```
+
 ## Benchmarks
 
 **NOTE: THESE ARE NOT CURRENT, AND SHOULD NOT BE RELIED ON**
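The chunking win comes from batching many rows into one multi-row INSERT. Here is a minimal sketch of that statement shape, assuming a made-up chunk size and table name; the commit's `make_sql_rows` does the same trailing comma-to-semicolon swap:

```python
# Sketch of chunked multi-row INSERT assembly; DEFAULT_INSERT_CHUNK_SIZE and
# the table name are illustrative values, not the project's.
DEFAULT_INSERT_CHUNK_SIZE = 10_000

def chunked_inserts(vals: list[str], table: str = "test") -> str:
    parts = []
    for i in range(0, len(vals), DEFAULT_INSERT_CHUNK_SIZE):
        parts.append(f"INSERT INTO `{table}` VALUES\n")
        for row in vals[i : i + DEFAULT_INSERT_CHUNK_SIZE]:
            parts.append(f"({row}),\n")
        # close the chunk: swap the last comma for a semi-colon
        parts[-1] = parts[-1][::-1].replace(",", ";", 1)[::-1]
    return "".join(parts)

print(chunked_inserts(["1, 'a@example.com'", "2, 'b@example.com'"]))
```

Fewer statements means less per-statement overhead on the server, which is broadly where the ~3x load-time improvement comes from.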
@@ -184,8 +200,8 @@ python3.11 create_entries.py -i full.json -n 1000000 --force --drop-table -o 4
 
 ## TODO
 
-* Support other SQL varieties, as well as CSV and TXT.
-* Add more column data sources, like addresses, phone numbers, and email addresses.
+* Support other SQL varieties.
+* Add more column data sources.
 * Create tests.
 * Come up with a coherent exception handling mechanism.
 * Add logging, maybe.

create_entries.py (+64 −17)
@@ -23,6 +23,7 @@
     JSON_OBJ_MAX_KEYS,
     JSON_OBJ_MAX_VALS,
     MYSQL_INT_MIN_MAX,
+    PHONE_NUMBER,
 )
 from utilities import utilities
 

@@ -90,6 +91,8 @@ def _add_error(error_schema: dict, key: tuple, value: dict, error_message: str):
         "timestamp",
         "text",
         "json",
+        "email",
+        "phone",
     ]
     pks = []
     errors = {}
@@ -116,6 +119,20 @@ def _add_error(error_schema: dict, key: tuple, value: dict, error_message: str):
         ]
         if col_pk:
             pks.append(k)
+        if k == "phone" and "char" not in col_type:
+            _add_error(
+                errors,
+                (k, "type"),
+                v,
+                f"column `{k}` must be of type CHAR or VARCHAR",
+            )
+        if k == "phone" and col_unique:
+            _add_error(
+                errors,
+                (k, "unique"),
+                v,
+                f"unique is not a valid option for column `{k}` - this is a performance decision; numbers are still unlikely to collide",
+            )
         if not col_type:
             _add_error(
                 errors,
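Concretely, a schema entry such as `"phone": {"type": "bigint", "unique": "true"}` would now fail validation twice: once because `phone` must be CHAR or VARCHAR, and once because `unique` is disallowed on `phone` as a performance trade-off (the shuffled ten-digit numbers are unlikely to collide anyway).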
@@ -135,7 +152,7 @@ def _add_error(error_schema: dict, key: tuple, value: dict, error_message: str):
                 errors,
                 (k, "width"),
                 v,
-                f"column type `{col_type}` is not supported",
+                f"width is not a valid option for column `{k}` of type `{col_type}`",
             )
         if col_autoinc and "int" not in col_type:
             _add_error(
@@ -260,6 +277,9 @@ def mysql(
     msg += f"DROP TABLE IF EXISTS `{tbl_name}`;\n"
     msg += f"CREATE TABLE `{tbl_name}` (\n"
     for col, col_attributes in schema.items():
+        # this may expand in the future
+        if col in ["phone"]:
+            cols[col]["create_ranged_arr"] = True
         col_opts = []
         for k, v in col_attributes.items():
             match k:
@@ -322,6 +342,8 @@ def __init__(self, args, schema, tbl_name, tbl_cols, tbl_create):
         self.tbl_cols = tbl_cols
         self.tbl_create = tbl_create
         self.tbl_name = tbl_name
+        _has_monotonic = False
+        _has_unique = False
 
         # exceeding auto_increment capacity is checked at schema validation, but since
         # the user can specify --validate without passing --num, uniques have to be checked here
@@ -336,18 +358,25 @@ def __init__(self, args, schema, tbl_name, tbl_cols, tbl_create):
                     f"MYSQL_MAX_{v['type'].upper().split()[0]}_SIGNED"
                 ]
 
+            if v.get("auto_inc"):
+                _has_monotonic = True
             if v.get("unique"):
+                _has_unique = True
                 if self.args.num > col_max_val:
                     raise TooManyRowsError(k, self.args.num, col_max_val) from None
+            # if uniquity isn't required, and the requested number of rows is greater
+            # than the column can handle, just set it to the column's max since we can repeat
             else:
                 if self.args.num > col_max_val:
                     self.rand_max_id = col_max_val
                 else:
                     self.rand_max_id = self.args.num
 
-        self.monotonic_id = self.allocator(self.args.num)
-        self.random_id = self.allocator(self.rand_max_id, shuffle=True)
-        self.unique_id = self.allocator(self.args.num, shuffle=True)
+        if _has_monotonic:
+            self.monotonic_id = self.allocator(0, self.args.num)
+        self.random_id = self.allocator(0, self.rand_max_id, shuffle=True)
+        if _has_unique:
+            self.unique_id = self.allocator(0, self.args.num, shuffle=True)
         try:
             with open("content/dates.txt", "r") as f:
                 self.dates = f.readlines()
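The two-argument calls above imply a ranged allocator contract: hand out ids from [start, end), optionally shuffled, and let released ids rejoin at the far end. Below is a hypothetical pure-Python stand-in; the real allocator is backed by the C fast_shuffle library and isn't shown in this diff:

```python
import random
from collections import deque

class IdAllocator:
    """Hypothetical stand-in for the allocator used by create_entries.py."""

    def __init__(self, start: int, end: int, shuffle: bool = False):
        ids = list(range(start, end))
        if shuffle:
            random.shuffle(ids)
        self._ids = deque(ids)

    def allocate(self) -> int:
        return self._ids.popleft()

    def release(self, id_: int) -> None:
        # released ids are appended to the right of the deque,
        # so they won't be immediately repeated
        self._ids.append(id_)

alloc = IdAllocator(0, 1000, shuffle=True)
first = alloc.allocate()
alloc.release(first)
```

Guarding `monotonic_id` and `unique_id` behind `_has_monotonic` / `_has_unique` means tables without those columns skip two allocations of `num` ids each, which is part of this commit's speedup.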
@@ -380,9 +409,9 @@ def sample(
             sample_list.append(iterable[idx])
         return sample_list
 
-    def make_row(self, schema: dict, idx: int) -> dict:
+    def make_row(self, schema: dict, idx: int, has_timestamp: bool) -> dict:
         row = {}
-        if any("timestamp" in s.values() for s in schema.values()):
+        if has_timestamp:
             date = self.sample(self.dates, self.args.num)
         for col, opts in schema.items():
             if "id" in col:
@@ -395,7 +424,6 @@ def make_row(self, schema: dict, idx: int) -> dict:
 
                 # these are appended to the right of the deque, so they won't be immediately repeated
                 self.random_id.release(row[col])
-
             elif col == "first_name":
                 random_first = self.sample(self.first_names, self.num_rows_first_names)
                 first_name = f"{random_first}".replace("'", "''")
@@ -414,13 +442,14 @@ def make_row(self, schema: dict, idx: int) -> dict:
 
             elif schema[col]["type"] == "json":
                 json_dict = {}
-                keys = self.sample(
+                json_keys = self.sample(
                     self.wordlist, self.num_rows_wordlist, JSON_OBJ_MAX_KEYS
                 )
-                vals = self.sample(
-                    self.wordlist, self.num_rows_wordlist, JSON_OBJ_MAX_VALS
+                # grab an extra for use with email if needed
+                json_vals = self.sample(
+                    self.wordlist, self.num_rows_wordlist, JSON_OBJ_MAX_VALS + 1
                 )
-                json_dict[keys.pop()] = vals.pop()
+                json_dict[json_keys.pop()] = json_vals.pop()
                 max_rows_pct = float(
                     schema.get(col, {}).get("max_length", DEFAULT_MAX_FIELD_PCT)
                 )
@@ -432,13 +461,30 @@ def make_row(self, schema: dict, idx: int) -> dict:
                 json_arr_len = ceil((JSON_OBJ_MAX_VALS - 1) * max_rows_pct)
                 # make 20% of the JSON objects nested with a list object of length json_arr_len
                 if not idx % 5:
-                    key = keys.pop()
+                    key = json_keys.pop()
                     json_dict[key] = {}
-                    json_dict[key][keys.pop()] = [
-                        vals.pop() for _ in range(json_arr_len)
+                    json_dict[key][json_keys.pop()] = [
+                        json_vals.pop() for _ in range(json_arr_len)
                     ]
                 row[col] = f"'{json.dumps(json_dict)}'"
 
+            elif col == "email":
+                try:
+                    email_domain = json_vals.pop()
+                except UnboundLocalError:
+                    email_domain = self.sample(self.wordlist, self.num_rows_wordlist)
+                try:
+                    email_local = random_first
+                except UnboundLocalError:
+                    email_local = self.sample(
+                        self.first_names, self.num_rows_first_names
+                    )
+                row[col] = f"'{email_local.lower()}@{email_domain}.com'"
+            elif col == "phone":
+                phone_digits = [str(x) for x in range(10)]
+                random.shuffle(phone_digits)
+                phone_str = "".join(phone_digits)
+                row[col] = f"'{PHONE_NUMBER[args.country](phone_str)}'"
             elif schema[col]["type"] == "text":
                 max_rows_pct = float(
                     schema.get(col, {}).get("max_length", DEFAULT_MAX_FIELD_PCT)
@@ -490,8 +536,8 @@ def make_sql_rows(self, vals: list) -> list:
             chunk_list = vals[i : i + DEFAULT_INSERT_CHUNK_SIZE]
             for row in chunk_list:
                 insert_rows.append(f"({row}),\n")
-            # if we reach the end of a chunk list, make the multi-insert a commit by swapping
-            # the last comma to a semi-colon
+            # if we reach the end of a chunk list, make the multi-insert statement a single
+            # query by swapping the last comma to a semi-colon
             insert_rows[-1] = insert_rows[-1][::-1].replace(",", ";", 1)[::-1]
         else:
             for row in vals:
@@ -508,8 +554,9 @@ def make_sql_rows(self, vals: list) -> list:
     def run(self):
         sql_inserts = []
         random.seed(os.urandom(4))
+        _has_timestamp = any("timestamp" in s.values() for s in self.schema.values())
         for i in range(1, self.args.num + 1):
-            row = self.make_row(self.schema, i)
+            row = self.make_row(self.schema, i, _has_timestamp)
            sql_inserts.append(row)
         vals = [",".join(str(v) for v in d.values()) for d in sql_inserts]
         match args.filetype:
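Hoisting the `any()` scan out of the per-row loop is the other part of the speedup: the schema is invariant across rows, so the check only needs to run once. Here is a rough, self-contained demonstration of the difference; all names are made up for the demo:

```python
import timeit

schema = {f"col{i}": {"type": "int"} for i in range(20)}
schema["created_at"] = {"type": "timestamp"}

def per_row(n: int) -> None:
    # old shape: re-scan the schema on every row
    for _ in range(n):
        any("timestamp" in s.values() for s in schema.values())

def hoisted(n: int) -> None:
    # new shape: scan once, then reuse the flag
    has_timestamp = any("timestamp" in s.values() for s in schema.values())
    for _ in range(n):
        _ = has_timestamp

print("per-row:", timeit.timeit(lambda: per_row(100_000), number=1))
print("hoisted:", timeit.timeit(lambda: hoisted(100_000), number=1))
```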

full.json (+5 −0)
@@ -38,5 +38,10 @@
         "type": "timestamp",
         "nullable": "true",
         "default": "NULL"
+    },
+    "email": {
+        "type": "varchar",
+        "width": "255",
+        "nullable": "true"
     }
 }

library/fast_shuffle.c (+12 −0)
@@ -14,6 +14,18 @@ uint32_t *fill_array(uint32_t size) {
     return arr;
 }
 
+uint32_t *fill_array_range(uint32_t start, uint32_t end) {
+    uint32_t size = end - start;
+    uint32_t *arr = calloc(size, sizeof(uint32_t));
+    if (!arr) {
+        return NULL;
+    }
+    /* fill with the values [start, end); index from zero so writes stay inside the allocation */
+    for (uint32_t i = 0; i < size; i++) {
+        arr[i] = start + i;
+    }
+    return arr;
+}
+
 uint32_t right_shift(uint32_t range, uint32_t *seed) {
     uint64_t random32bit, multiresult;
     uint32_t leftover, threshold;
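For poking at `fill_array_range` from Python, a ctypes sketch like the following should work once `make` has produced a shared object. The library path and the idea of loading it via ctypes are assumptions; the project's actual Python/C binding isn't part of this diff:

```python
import ctypes

# Assumed artifact name; adjust to whatever your `make` actually produces.
lib = ctypes.CDLL("./library/fast_shuffle.so")
lib.fill_array_range.argtypes = [ctypes.c_uint32, ctypes.c_uint32]
lib.fill_array_range.restype = ctypes.POINTER(ctypes.c_uint32)

start, end = 100, 110
arr = lib.fill_array_range(start, end)  # values in [start, end)
print([arr[i] for i in range(end - start)])  # [100, 101, ..., 109]
# note: the C side allocates with calloc; a long-running caller
# would need to free this buffer via the C library
```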

only_email.json (+25 −0)
@@ -0,0 +1,25 @@
+
+{
+    "user_id": {
+        "type": "bigint unsigned",
+        "nullable": "false",
+        "auto increment": "true",
+        "primary key": "true"
+    },
+    "email": {
+        "type": "varchar",
+        "width": "255",
+        "nullable": "true"
+    },
+    "external_id": {
+        "type": "bigint unsigned",
+        "nullable": "false",
+        "unique": "true",
+        "default": "0"
+    },
+    "last_modified": {
+        "type": "timestamp",
+        "nullable": "true",
+        "default": "null"
+    }
+}

only_phone.json (+25 −0)
@@ -0,0 +1,25 @@
+
+{
+    "user_id": {
+        "type": "bigint unsigned",
+        "nullable": "false",
+        "auto increment": "true",
+        "primary key": "true"
+    },
+    "phone": {
+        "type": "varchar",
+        "width": "255",
+        "nullable": "true"
+    },
+    "external_id": {
+        "type": "bigint unsigned",
+        "nullable": "false",
+        "unique": "true",
+        "default": "0"
+    },
+    "last_modified": {
+        "type": "timestamp",
+        "nullable": "true",
+        "default": "null"
+    }
+}
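With either schema in place, generation is the usual invocation, along the lines of `python3 create_entries.py -i only_phone.json -n 100000 --country us -o phone.sql` (illustrative values; see the usage text above for the full option list).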
