Commit 52056f3

More things (#6)
* chore: interim commit
* feat: added exceptions
* chore: interim commit
* feat: schema validation works well
* fix: fixed an incorrect column name
* fix: made better defaults
* chore: dedent
* fix: more numpy
* chore: different exception catching in one place
* chore: wip for fast shuffle
* feat: using a C shared library for array shuffling
* chore: moved library
* fix: identified bug with fast_shuffle; implemented random drop for ids
* feat: fixes an off-by-one causing a missing item, and improved randomness. The C library needed to use a re-entrant version of rand() in order to not deterministically generate shuffled arrays. Unrelated (but included in this commit, oh well), the schema now defaults to "full_name" instead of "name", as the generation script has also changed to now do first, last, and/or full names.
* feat: shifted some files into content/ directory
* chore: simplified exception handling for _add_error()
* chore: ran black, fixed some mypy typing
* chore: added poetry
* feat: added json creation
* chore: added example of bad json schema
* chore: added longer example of valid json schema
* feat: added TEXT column generation, made schema validation default
* chore: added sql to gitignore
* chore: fixed make clean target
* chore: added exception handling for missing library
* chore: added mpl-2.0 license
* chore: updated README
* chore: removed dead code, added whitespace
* fix: removed some looping lower() calls in favor of doing it once for a given schema
* feat: bundled two C libraries
* chore: updated gitignore
* fix: removed txt option as it didn't make sense; fixed a leftover erroneous note in help
* feat: added csv support
* chore: updated readme
* feat: added a library for darwin_x86-64
* feat: adding constants
* chore: added lorem ipsum, attributing names
* chore: tweaking full example schema
* chore: interim commit, terribly sorry for lack of description
* chore: updated .gitignore
* fix: checking is now done for column max capacity at validation, and random ids get looped under the column's max
* chore: updated readme
* chore: renamed a class instance, and reformatted
* feat: added chunking of generated sql files
* chore: ran black
* chore: fixed readme doubling
* WIP: interim commit
* fix: swapped chunking behavior to default
* chore: cleanup
* feat: added better defaults for json and timestamps; speedups; bugfixes
* chore: updated readme
* chore: updated base schemas
* chore: split gensql for better readability; renamed create_entries
* chore: removed old files not in use
* feat: added cities
* feat: added countries - wip, speed for random selection is atrocious
* fix: improved speed of country by way of severely reducing the amount of american countries - top 600 by population now
* fix: power instead of multiplication
* chore: updated readme
* chore: fixing a regression
* fix: improved documentation, made some options more accessible
* fix: improved visibility of errors in schema
* fix: split out validation into its own class
* chore: updated readme
* fix: moved schema inputs to their own directory
* chore: updated readme
* feat: added unique checks for emails, other cleanup and small improvements
* fix: no-op gitignore to get the empty directory into git
* chore: removing duplicates
* fix: better error handling, added fixed-length option
* WIP: didn't see a speedup from this, but it may prove useful so storing it
* chore: changed to logging, removed an unused module
* fix: merged sql usage from sqlite branch; approximately 20% faster for some city/country combinations due to cache usage
* WIP: didn't see a speedup from this, but it may prove useful so storing it
* fix: fixed edge case if user doesn't specify any primary key
* fix: moved json keys into constants
* fix: 'upgraded' to string comprehension to create a json column. This is in no way good, is extremely fragile, and in general is a terrible idea - but it is slightly faster than json.dumps()
* Revert "fix: 'upgraded' to string comprehension to create a json column". This reverts commit d67a835.
* chore: minor reformatting, added a note about performance on sample()
* chore: broke out runner's init
* fix: added some tests
* feat: added uuid support
* fix: macos creates v4 now
* chore: added prebuilts for apple silicon
* chore: UUID_STR_LEN should be 37, not 36
* chore: renamed for use with run.sh
* chore: added script to symlink correct shared library
* fix: modified how doubles/floats are created
* chore: clarified a unicode symbol
* fix: improved schema validation checks for primary keys
* fix: added more helpful error message for JSON validation errors
* chore: ran black
* feat: added exception for unsupported RDBMS
* chore: removed sqlserver as supported type, renamed postgresql to postgres
* feat: added extremely poor postgres support - does not generate table schema, but does produce valid sql files
1 parent: d7bfdfc
40 files changed: +9681 −762

.gitignore (+1)

@@ -173,3 +173,4 @@ poetry.toml
 *.sql
 *.gz
 library/fast_shuffle.so
+library/uuid.so

Makefile (+24 −2)

@@ -1,5 +1,27 @@
+CC := gcc
+CFLAGS := -Wextra -Wall -O3
+LDLIBS := -luuid
+LDFLAGS := /usr/lib/x86_64-linux-gnu
+
+ifeq ($(shell uname), Darwin)
+MAC_HEADER_DIR := $(shell find /opt/homebrew/Cellar/ossp-uuid -name "uuid.h" -exec dirname {} \;)
+MAC_HEADER_DIR := $(strip $(MAC_HEADER_DIR))
+LDFLAGS := /opt/homebrew/lib
+endif
+
+.PHONY: make
 make:
-    gcc -Wextra -Wall -O3 -shared library/fast_shuffle.c -o library/fast_shuffle.so
+    @if [ ! -f $(LDFLAGS)/libuuid.a ]; then \
+        echo "FATAL: libuuid.a not found"; \
+        echo "On MacOS, use: brew install ossp-uuid"; \
+        echo "On Debian/Ubuntu, use: sudo apt-get install uuid-dev"; \
+        echo "On CentOS/Fedora/RHEL, use: sudo yum install libuuid-devel"; \
+        echo "On openSUSE, use: sudo zypper in libuuid-devel"; \
+        exit 1; \
+    fi
+    $(CC) $(CFLAGS) -shared library/fast_shuffle.c -o library/fast_shuffle.so
+    $(CC) $(CFLAGS) -shared library/uuid.c -L$(LDFLAGS) $(LDLIBS) -o library/uuid.so
 
+.PHONY: clean
 clean:
-    rm -f library/fast_shuffle.so
+    rm -f library/fast_shuffle.so library/uuid.so
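For context on how these shared objects are consumed: the README notes that ctypes looks the library up by filename, so loading the freshly built `library/fast_shuffle.so` from Python looks roughly like the sketch below. The exported symbol name and signature (`shuffle`) are assumptions for illustration only; the real interface is defined in `library/fast_shuffle.c`, which is not shown in this diff.

```python
import ctypes

# Hypothetical sketch: load the shared object built by the Makefile above.
# The symbol name and prototype below are assumed, not taken from the repo:
#   void shuffle(int *arr, size_t n);
fast_shuffle = ctypes.CDLL("library/fast_shuffle.so")

n = 10
arr = (ctypes.c_int * n)(*range(n))  # a C int[10] holding 0..9

fast_shuffle.shuffle.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_size_t]
fast_shuffle.shuffle.restype = None
fast_shuffle.shuffle(arr, n)

print(list(arr))  # the same ten integers, in shuffled order
```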

README.md (+45 −33)

@@ -7,35 +7,40 @@ Ever want to quickly create millions of rows of random data for a database, with
 ## Usage
 
 ```shell
-usage: create_entries.py [-h] [--extended-help] [-c] [--country {au,de,fr,ke,jp,mx,ua,uk,us}] [-d] [--drop-table] [--force] [-f {csv,mysql,postgresql,sqlserver}] [--generate-dates] [-g] [-i INPUT] [-n NUM] [-o OUTPUT] [-r] [-t TABLE] [--validate VALIDATE]
+usage: gensql.py [-h] [--extended-help] [--country {random,au,de,fr,gb,ke,jp,mx,ua,us}] [-d] [--drop-table] [--force] [-f {csv,mysql,postgresql,sqlserver}] [--fixed-length]
+                 [--generate-dates] [-g] [-i INPUT] [--no-check] [--no-chunk] [-n NUM] [-o OUTPUT] [-q] [-r] [-t TABLE] [--validate VALIDATE]
 
 options:
   -h, --help            show this help message and exit
   --extended-help       Print extended help
-  -c, --chunk           Chunk SQL INSERT statements
-  --country {au,de,fr,ke,jp,mx,ua,uk,us}
-                        The country's phone number structure to use if generating phone numbers
+  --country {random,au,de,fr,gb,ke,jp,mx,ua,us}
+                        A specific country (or random) to use for cities, phone numbers, etc.
   -d, --debug           Print tracebacks for errors
   --drop-table          WARNING: DESTRUCTIVE - use DROP TABLE with generation
   --force               WARNING: DESTRUCTIVE - overwrite any files
   -f {csv,mysql,postgresql,sqlserver}, --filetype {csv,mysql,postgresql,sqlserver}
                         Filetype to generate
+  --fixed-length        Disable any variations in length for JSON arrays, text, etc.
   --generate-dates      Generate a file of datetimes for later use
   -g, --generate-skeleton
                         Generate a skeleton input JSON schema
   -i INPUT, --input INPUT
                         Input schema (JSON)
-  -n NUM, --num NUM     The number of rows to generate
+  --no-check            Do not perform validation checks for unique columns
+  --no-chunk            Do not chunk SQL INSERT statements
+  -n NUM, --num NUM     The number of rows to generate - defaults to 1000
   -o OUTPUT, --output OUTPUT
-                        Output filename
+                        Output filename - defaults to gensql
+  -q, --quiet           Suppress printing various informational messages
   -r, --random          Enable randomness on the length of some items
   -t TABLE, --table TABLE
-                        Table name to generate SQL for
+                        Table name to generate SQL for - defaults to the filename
   --validate VALIDATE   Validate an input JSON schema
 ```
 
 ### Usage example
 
+0. Either build the C libraries with `make`, or execute `run.sh` to symlink the correct file for your arch, if available.
 1. Create a schema if you'd like, or use the included examples.
 
 ```

@@ -50,7 +55,7 @@ GenSQL expects a JSON input schema, of the format:
 }
 ```
 2. If necessary, build the C library with the included Makefile. Otherwise, rename the included file for your platform to `fast_shuffle.so` (or change the name ctypes is looking for, your choice).
-3. Run GenSQL, example `python3 create_entries.py -i $YOUR_SCHEMA.json -n 10000 -f mysql`.
+3. Run GenSQL, example `python3 gensql.py -i $YOUR_SCHEMA.json -n 10000 -f mysql`.
 
 ## Requirements
 

@@ -61,10 +66,16 @@ GenSQL expects a JSON input schema, of the format:
 
 * The `--filetype` flag only supports `csv` and `mysql`. The only supported RDBMS is MySQL (probably 8.x; it _might_ work with 5.7.8 if you want a JSON column, and earlier if you don't).
 * Generated datetimes are in UTC, i.e. no DST events exist. If you remove the query to set the session's timezone, you may have a bad time.
-* This uses a C library to perform random shuffles. There are no external libraries, so as long as you have a reasonably new compiler, `make` should work for you.
+* This uses a C library for a few functions, notably filling large arrays and shuffling them. For UUID creation, the library <uuid/uuid.h> is required to build the shared library.
+* Currently, generating UUIDs only supports v1 and v4, and if they're to be stored as `BINARY` types, only .sql file format is supported. Also as an aside, it's a terrible idea to use a UUID (at least v4) as a PK in InnoDB, so please be sure of what you're doing. If you don't believe me, generate one, and another using a monotonic integer or something similar, and compare on-disk sizes for the tablespaces.
 * `--force` and `--drop-table` have warnings for a reason. If you run a query with `DROP TABLE IF EXISTS`, please be sure of what you're doing.
-* `--random` allows for TEXT and JSON columns to have varying amounts of length, which may or may not matter to you. It will cause a ~10% slowdown. If not selected, a deterministic 20% of the rows in these columns will have a longer length than the rest. If this also bothers you, change DEFAULT_VARYING_LENGTH to `False`.
+* `--random` allows for TEXT and JSON columns to have varying amounts of length, which may or may not matter to you. It will cause a ~10% slowdown. If not selected, a deterministic 20% of the rows in these columns will have a longer length than the rest. If this also bothers you, use `--fixed-length`.
 * `--generate-dates` takes practically the same amount of time, or slightly longer, than just having them generated on-demand. It's useful if you want to have the same set of datetimes for a series of tables, although their actual ordering for row generation will remain random.
+* Any column with `id` in its name will by default be assumed to be an integer type, and will have integers generated for it. You can provide hints to disable this, or to enable it for columns without `id` in their names, by using `is_id: {true, false}` in your schema.
+* To have an empty JSON array be set as the default value for a JSON column, use the default value `array()`.
+* The generated values for a JSON column can be an object of random words (the default), or an array of random integers. For the latter, set the hint `is_numeric_array` in the schema's object.
+* To have a column be given no `INSERT` statements, e.g. remain empty / with its default value, set the hint `is_empty: true` in the schema definition for the column.
+* To have the current datetime statically defined as the default value for a TIMESTAMP column, use the default value `static_now()`. To also have the column's default automatically update the timestamp, use the default value `now()`. To have the column's default value be NULL, but update automatically to the current timestamp when the row is updated, use `null_now()`.
 * Using a column of name `phone` will generate realistic - to the best of my knowledge - phone numbers for a given country (very limited set). It's currently non-optimized for performance, and thus incurs a ~40% slowdown over the baseline. A solution in C may or may not speed things up, as it's not that performing `random.shuffle()` on a 10-digit number is slow, it's that doing so `n` times is a lot of function calls. Inlining C functions in Python [does exist](https://github.com/ssize-t/inlinec), but the non-caching of its compilation would probably negate any savings.
 * Similarly, a column of name `email` will generate realistic email addresses (all with `.com` TLD), and will incur a ~40% slowdown over the baseline.
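To illustrate how the new schema hints described in the bullets above might fit together, here is a purely hypothetical schema fragment. The exact key layout (including the `type` key) and the column names are placeholders; the project's real example schemas are not part of this diff, so treat this only as a sketch of how `is_id`, `is_empty`, `is_numeric_array`, `array()`, `static_now()`, and `null_now()` compose.

```python
import json

# Hypothetical schema fragment; key layout and column names are placeholders,
# only the hint names and default-value keywords come from the README above.
schema = {
    "user_id": {"type": "bigint unsigned", "is_id": True},
    "tags": {"type": "json", "default": "array()", "is_numeric_array": True},
    "notes": {"type": "text", "is_empty": True},
    "created_at": {"type": "timestamp", "default": "static_now()"},
    "updated_at": {"type": "timestamp", "default": "null_now()"},
}

# GenSQL expects the schema as a JSON file, so it would be written out like:
print(json.dumps(schema, indent=2))
```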

@@ -81,7 +92,8 @@ And then, from within the `mysql` client:
 
 ```mysql
 mysql> SET @@time_zone = '+00:00';
-mysql> LOAD DATA INFILE '/path/to/your/file.csv' INTO TABLE $TABLE_NAME FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY "'" IGNORE 1 LINES;
+mysql> SET @@unique_checks = 0;
+mysql> LOAD DATA INFILE '/path/to/your/file.csv' INTO TABLE $TABLE_NAME FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY "'" IGNORE 1 LINES ($COL_0, $COL_1, ... $COL_N);
 Query OK, 1000 rows affected (1.00 sec)
 Records: 1000 Deleted: 0 Skipped: 0 Warnings: 0
 ```

@@ -101,9 +113,9 @@ However, if you don't have access to the host, there are some tricks GenSQL has
 * Disabling autocommit
   * Normally, each statement is committed one at a time. With this disabled, an explicit `COMMIT` statement must be used to commit.
 * Disabling unique checks
-  * Normally, the SQL engine will check that any columns declaring `UNIQUE` constraints do in fact meet that constraint. With this disabled, repetitive `INSERT` statements are much faster, with the obvious risk of violating the constraint. For nonsense data that has been created with unique elements, this is safe to temporarily disable.
+  * Normally, the SQL engine will check that any columns declaring `UNIQUE` constraints do in fact meet that constraint. With this disabled, repetitive `INSERT` statements are much faster, with the obvious risk of violating the constraint. Since GenSQL by default does its own checks at creation for unique columns (currently limited to integer columns and `email` columns), this is generally safe to disable. If you use `--no-check`, this should not be disabled.
 * Multi-INSERT statements
-  * Normally, an `INSERT` statement might look something like `INSERT INTO $TABLE (col_1, col_2) VALUES (row_1, row_2);` Instead, they can be written like `INSERT INTO $TABLE (col_1, col_2) VALUES (row_1, row_2), (row_3, row_4),` with `n` tuples of row data. By default, `mysqld` (the server) is limited to a 64 MiB packet size, and `mysql` (the client) to a 16 MiB packet size. Both of these can be altered up to 1 GiB, but the server side may not be accessible to everyone, so GenSQL limits itself to a 10,000 row chunk size, which should comfortably fit under the server limit. For the client, you'll need to pass `--max-allowed-packet=67108864` as an arg.
+  * Normally, an `INSERT` statement might look something like `INSERT INTO $TABLE (col_1, col_2) VALUES (row_1, row_2);` Instead, they can be written like `INSERT INTO $TABLE (col_1, col_2) VALUES (row_1, row_2), (row_3, row_4),` with `n` tuples of row data. By default, `mysqld` (the server) is limited to a 64 MiB packet size, and `mysql` (the client) to a 16 MiB packet size. Both of these can be altered up to 1 GiB, but the server side may not be accessible to everyone, so GenSQL limits itself to a 10,000 row chunk size, which should comfortably fit under the server limit. For the client, you'll need to pass `--max-allowed-packet=67108864` as an arg. If you don't want this behavior, you can use `--no-chunk` when creating the data.
 
 
 Testing with inserting 100,000 rows (DB is backed by spinning disks):

@@ -147,55 +159,55 @@ Testing the creation of the standard 4-column schema, as well as an extended 8-c
 #### Python 3.11
 
 ```shell
-time python3.11 create_entries.py -n 1000000 --force --drop-table
-python3.11 create_entries.py -n 1000000 --force --drop-table  4.56s user 0.16s system 99% cpu 4.744 total
+time python3.11 gensql.py -n 1000000 --force --drop-table
+python3.11 gensql.py -n 1000000 --force --drop-table  4.56s user 0.16s system 99% cpu 4.744 total
 
-time python3.11 create_entries.py -i full.json -n 1000000 --force --drop-table
-python3.11 create_entries.py -i full.json -n 1000000 --force --drop-table  12.70s user 1.13s system 98% cpu 14.089 total
+time python3.11 gensql.py -i full.json -n 1000000 --force --drop-table
+python3.11 gensql.py -i full.json -n 1000000 --force --drop-table  12.70s user 1.13s system 98% cpu 14.089 total
 ```
 
 #### Python 3.10
 
 ```shell
-time python3 create_entries.py -n 1000000 --force --drop-table
-python3 create_entries.py -n 1000000 --force --drop-table  5.27s user 0.17s system 99% cpu 5.442 total
+time python3 gensql.py -n 1000000 --force --drop-table
+python3 gensql.py -n 1000000 --force --drop-table  5.27s user 0.17s system 99% cpu 5.442 total
 
-time python3 create_entries.py -i full.json -n 1000000 --force --drop-table
-python3 create_entries.py -i full.json -n 1000000 --force --drop-table  16.23s user 0.54s system 99% cpu 16.840 total
+time python3 gensql.py -i full.json -n 1000000 --force --drop-table
+python3 gensql.py -i full.json -n 1000000 --force --drop-table  16.23s user 0.54s system 99% cpu 16.840 total
 ```
 
 ### Intel i9 Macbook Pro
 
 #### Python 3.11
 
 ```shell
-time python3.11 create_entries.py -n 1000000 --force --drop-table
-python3.11 create_entries.py -n 1000000 --force --drop-table  8.51s user 0.47s system 99% cpu 9.023 total
+time python3.11 gensql.py -n 1000000 --force --drop-table
+python3.11 gensql.py -n 1000000 --force --drop-table  8.51s user 0.47s system 99% cpu 9.023 total
 
-time python3.11 create_entries.py -i full.json -n 1000000 --force --drop-table
-python3.11 create_entries.py -i full.json -n 1000000 --force --drop-table  25.68s user 1.60s system 99% cpu 27.395 total
+time python3.11 gensql.py -i full.json -n 1000000 --force --drop-table
+python3.11 gensql.py -i full.json -n 1000000 --force --drop-table  25.68s user 1.60s system 99% cpu 27.395 total
 ```
 
 #### Python 3.10
 
 ```shell
-time python3 create_entries.py -n 1000000 --force --drop-table
-python3 create_entries.py -n 1000000 --force --drop-table  9.88s user 0.46s system 99% cpu 10.405 total
+time python3 gensql.py -n 1000000 --force --drop-table
+python3 gensql.py -n 1000000 --force --drop-table  9.88s user 0.46s system 99% cpu 10.405 total
 
-time python3 create_entries.py -i full.json -n 1000000 --force --drop-table
-python3 create_entries.py -i full.json -n 1000000 --force --drop-table  32.60s user 1.66s system 99% cpu 34.364 total
+time python3 gensql.py -i full.json -n 1000000 --force --drop-table
+python3 gensql.py -i full.json -n 1000000 --force --drop-table  32.60s user 1.66s system 99% cpu 34.364 total
 ```
 
 ### Xeon E5-2650v2 server
 
 A ramdisk was used to eliminate the spinning disk overhead for the server.
 
 ```shell
-time python3.11 create_entries.py -n 1000000 --force --drop-table -o /mnt/ramdisk/test.sql
-python3.11 create_entries.py -n 1000000 --force --drop-table -o  15.35s user 0.85s system 98% cpu 16.377 total
+time python3.11 gensql.py -n 1000000 --force --drop-table -o /mnt/ramdisk/test.sql
+python3.11 gensql.py -n 1000000 --force --drop-table -o  15.35s user 0.85s system 98% cpu 16.377 total
 
-time python3.11 create_entries.py -i full.json -n 1000000 --force --drop-table -o /mnt/ramdisk/test.sql
-python3.11 create_entries.py -i full.json -n 1000000 --force --drop-table -o  45.26s user 3.79s system 99% cpu 49.072 total
+time python3.11 gensql.py -i full.json -n 1000000 --force --drop-table -o /mnt/ramdisk/test.sql
+python3.11 gensql.py -i full.json -n 1000000 --force --drop-table -o  45.26s user 3.79s system 99% cpu 49.072 total
 ```
 
 ## TODO
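The "Multi-INSERT statements" item in the README diff above explains why GenSQL emits INSERTs in 10,000-row chunks. A minimal sketch of that chunking idea, under the assumption that rows are already generated in memory, looks like the following; it is not the project's actual implementation, and the `repr()`-based quoting is a placeholder rather than proper SQL escaping.

```python
# Minimal sketch of chunked multi-row INSERT generation (not GenSQL's real code).
# Table and column names are placeholders; quoting via repr() is illustrative only.
CHUNK_SIZE = 10_000  # stays comfortably under mysqld's default 64 MiB packet size

def chunked_inserts(table, columns, rows, chunk_size=CHUNK_SIZE):
    """Yield one multi-row INSERT statement per chunk of rows."""
    cols = ", ".join(columns)
    for i in range(0, len(rows), chunk_size):
        chunk = rows[i : i + chunk_size]
        values = ", ".join(
            "(" + ", ".join(repr(v) for v in row) + ")" for row in chunk
        )
        yield f"INSERT INTO {table} ({cols}) VALUES {values};"

# Example: 25,000 rows become three INSERT statements (10k, 10k, 5k rows each).
rows = [(i, f"user_{i}") for i in range(25_000)]
statements = list(chunked_inserts("users", ["id", "full_name"], rows))
print(len(statements))  # 3
```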

broken.json (−34)

This file was deleted.
