Add groupings column to leaderboard_cube
When players have never (for example) played in a league match, the cube
groupings with league included will have NULLs in the league column, just
like the groupings without league. This will cause these players' stats to
be overcounted when not filtering by league. To prevent this, add a
groupings column so we can know for sure which rows we should include in
the results.
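
The ambiguity can be sketched outside the database (an illustrative toy, not
the real schema: two columns, one stat, and a hand-rolled CUBE):

```python
# Toy model of GROUP BY CUBE over (league, formatid), summing kills.
# Bit i of `mask` set => column i is aggregated away (reported as None).
# This mirrors what GROUPING() reports, though the real bit order follows
# the argument order of the grouping() call in the view.
rows = [
    {"league": "rgl", "formatid": 6, "kills": 10},
    {"league": None,  "formatid": 6, "kills": 5},  # never played a league match
]

def cube(rows, columns):
    out = {}
    for mask in range(2 ** len(columns)):
        for row in rows:
            key = tuple(None if mask & (1 << i) else row[c]
                        for i, c in enumerate(columns))
            out[(mask, key)] = out.get((mask, key), 0) + row["kills"]
    return out

cube_rows = cube(rows, ("league", "formatid"))

# Without the mask, two semantically different rows share the key (None, 6):
assert [k for (_, k) in cube_rows].count((None, 6)) == 2
# With the mask, each can be selected unambiguously:
assert cube_rows[(0b00, (None, 6))] == 5   # league kept: only the no-league player
assert cube_rows[(0b01, (None, 6))] == 15  # league aggregated away: everyone
```

Postgres's GROUPING() provides exactly this mask, which is what the new
column stores.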

In addition to fixing this correctness problem, the groupings column also
lets us rework our indexes to take advantage of bloom filters. Unlike b-tree
indices, which are only efficient when filtering by columns in index order,
bloom indices have no ordering preference, and filter just as well as long
as enough columns are filtered on. The bloom index implementation in
Postgres does not support excluding NULLs. However, by including the
groupings column in the index we can filter to the correct rows without
requiring NULL support.
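
To make that last step concrete, here is a sketch (bit values are
assumptions matching grouping(playerid, league, formatid, classid, mapid),
with the rightmost argument as the least-significant bit): one equality test
on the grouping value pins down exactly which columns are NULL by
aggregation, so the index needs no IS NULL support.

```python
# Bit per cube column, 1 = aggregated away (NULL in that column's place).
# Assumed to match grouping(playerid, league, formatid, classid, mapid),
# rightmost argument = least-significant bit.
BITS = {"playerid": 0b10000, "league": 0b01000, "formatid": 0b00100,
        "classid": 0b00010, "mapid": 0b00001}

def grouping_for(filtered):
    """Grouping value for rows where exactly `filtered` columns are grouped
    (playerid is always grouped for the leaderboard)."""
    kept = set(filtered) | {"playerid"}
    return sum(bit for col, bit in BITS.items() if col not in kept)

# No user filters: everything but playerid is aggregated away.
assert grouping_for([]) == 0b01111
# Filtering by league alone:
assert grouping_for(["league"]) == 0b00111
# Filtering by map alone:
assert grouping_for(["mapid"]) == 0b01110
```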

Following the general outline of [1], the entropy in each of the filtering
columns is:

column   entropy
======== =======
league      0.40
formatid    2.20
classid     2.69
mapid       7.07
grouping    3.56
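
The entropy figures come from the live column distributions, which aren't in
this commit; the calculation itself is just Shannon entropy. A sketch with
made-up distributions:

```python
# Shannon entropy of a column: -sum(p * log2(p)) over the frequency p of
# each distinct value. The distributions below are invented for illustration;
# the table's numbers come from production data.
from collections import Counter
from math import log2

def column_entropy(values):
    n = len(values)
    return -sum((c / n) * log2(c / n) for c in Counter(values).values())

# Four equally likely values carry exactly 2 bits (cf. formatid's 2.20):
assert column_entropy(["a", "b", "c", "d"] * 25) == 2.0
# A heavily skewed column carries much less (cf. league's 0.40):
assert 0.4 < column_entropy(["none"] * 90 + ["rgl"] * 10) < 0.5
```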

As the information stored in the default signature length of 80 bits is
6.322, we can use one bit for each column (slightly shortchanging mapid).
This gives us a total number of set bits (I) of 5. Using the formula for
signature length (s_r) assuming 4K pages and a 4x random-read cost, we find
the optimal signature length for a given number of filters (Q) is:

I Q   s_r
= = =====
5 1 809.5
5 2 169.4
5 3  75.8
5 4  46.6
5 5  33.6
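
The s_r values above come from the cost model in [1]; as a rough cross-check
(a generic bloom-filter estimate, not that exact formula), the chance that a
random signature passes a probe falls geometrically with each extra filtered
column:

```python
# With i bits set out of an s-bit signature and one probed bit per filtered
# column, a non-matching row still passes a q-column probe with probability
# about (fraction of set bits) ** q.
def false_positive_rate(s, i, q):
    p_bit = 1 - (1 - 1 / s) ** i  # fraction of set bits, ~ i/s when i << s
    return p_bit ** q

rates = [false_positive_rate(80, 5, q) for q in range(1, 6)]

# Each extra filtered column cuts false positives by a factor of ~16 (80/5):
assert all(a / b > 15 for a, b in zip(rates, rates[1:]))
# By Q = 3, well under 0.1% of non-matching rows survive the signature
# check, which lines up with the index paying off at 3 or more filters.
assert rates[2] < 0.001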

This indicates that the default signature length of 80 supports efficient
querying when we filter by at least 3 columns. We will always filter by at
least one column (grouping), so the bloom index will be efficient whenever
the user filters on 2 or more columns. This means we need more-efficient
indices for the single-filter case. Fortunately, b-tree indices are a great
fit here. In the case where the user applies no filters at all, we still
filter by grouping, so we can use a b-tree index for that as well. This
indexing strategy roughly halves the index space, and should be much more
robust to arbitrary filter combinations.

[1] https://web.archive.org/web/20190201134134/https://blog.coelho.net/database/2016/12/11/postgresql-bloom-index.html

Fixes: 44be5a5 ("Optimize leaderboard")
Signed-off-by: Sean Anderson <[email protected]>
Forty-Bot committed Feb 23, 2024
1 parent 6738e9a commit a2717f4
Showing 7 changed files with 153 additions and 17 deletions.
6 changes: 5 additions & 1 deletion README.adoc
@@ -39,7 +39,11 @@ Verify that you can connect to the database by running

$ psql -d trends

You can exit the `psql` shell using the `\q` command.
You can exit the `psql` shell using the `\q` command. Finally, before importing any data, run

$ sudo -u postgres psql -d trends -f trends/bloom.sql

to enable the bloom index extension (which must be done as a superuser).

=== Importing data

6 changes: 6 additions & 0 deletions test/conftest.py
@@ -3,10 +3,12 @@

from contextlib import contextmanager
import logging
import os

import pytest
from testing.postgresql import Postgresql

import trends
from trends.sql import db_connect
from .create import create_test_db

@@ -22,6 +24,10 @@ def database(request):
postgres_args = Postgresql.DEFAULT_SETTINGS['postgres_args']
postgres_args += " -c full_page_writes=off"
with Postgresql(postgres_args=postgres_args) as database:
with db_connect(database.url()) as c:
with open(f"{os.path.dirname(trends.__file__)}/bloom.sql") as bloom:
c.cursor().execute(bloom.read())

with caplog_session(request) as caplog:
with caplog.at_level(logging.ERROR):
create_test_db(database.url())
11 changes: 5 additions & 6 deletions test/create.py
@@ -88,16 +88,15 @@ def create_test_db(url):
with db_connect(url) as c:
cur = c.cursor()
cur.execute("ANALYZE;")
# A second time to test partitioning log_json
db_init(c)
cur.execute("REFRESH MATERIALIZED VIEW leaderboard_cube;")
cur.execute("REFRESH MATERIALIZED VIEW map_popularity;")

with db_connect(url) as c:
class args:
since = datetime.fromtimestamp(0)
trends.importer.link_demos.link_logs(args, c)
trends.importer.link_matches.link_matches(args, c)
cur.execute("ANALYZE;")
# A second time to test partitioning log_json
db_init(c)
cur.execute("REFRESH MATERIALIZED VIEW leaderboard_cube;")
cur.execute("REFRESH MATERIALIZED VIEW map_popularity;")

if __name__ == '__main__':
if len(sys.argv) != 2:
9 changes: 9 additions & 0 deletions trends/bloom.sql
@@ -0,0 +1,9 @@
CREATE EXTENSION IF NOT EXISTS bloom;

DO $$ BEGIN
CREATE OPERATOR CLASS enum_ops DEFAULT FOR TYPE anyenum USING bloom AS
OPERATOR 1 =(anyenum, anyenum),
FUNCTION 1 hashenum(anyenum);
EXCEPTION WHEN duplicate_object THEN
NULL;
END $$;
73 changes: 73 additions & 0 deletions trends/migrations/leaderboard_bloom.sql
@@ -0,0 +1,73 @@
BEGIN;

DROP MATERIALIZED VIEW leaderboard_cube;
CREATE MATERIALIZED VIEW leaderboard_cube AS SELECT
playerid,
league,
formatid,
primary_classid AS classid,
mapid,
grouping(playerid, league, formatid, primary_classid, mapid) AS grouping,
sum(log.duration) AS duration,
sum((wins > losses)::INT) AS wins,
sum((wins = losses)::INT) AS ties,
sum((wins < losses)::INT) AS losses,
sum(kills) AS kills,
sum(deaths) AS deaths,
sum(assists) AS assists,
sum(dmg) AS dmg,
sum(dt) AS dt,
sum(shots) AS shots,
sum(hits) AS hits
FROM log_nodups AS log
JOIN player_stats USING (logid)
GROUP BY CUBE (playerid, league, formatid, classid, mapid)
ORDER BY mapid, classid, formatid, playerid, league;

-- To help out the query planner
CREATE STATISTICS IF NOT EXISTS leaderboard_stats (dependencies, ndistinct, mcv)
ON league, formatid, classid, mapid, grouping
FROM leaderboard_cube;

-- When we have no filters (or nothing better)
CREATE INDEX IF NOT EXISTS leaderboard_grouping ON leaderboard_cube (grouping);

-- When we have a single filter
CREATE INDEX IF NOT EXISTS leaderboard_league ON leaderboard_cube (league)
WHERE playerid NOTNULL
AND league NOTNULL
AND formatid ISNULL
AND classid ISNULL
AND mapid ISNULL
AND grouping = b'00111'::INT;
CREATE INDEX IF NOT EXISTS leaderboard_format ON leaderboard_cube (formatid)
WHERE playerid NOTNULL
AND league ISNULL
AND formatid NOTNULL
AND classid ISNULL
AND mapid ISNULL
AND grouping = b'01011'::INT;
CREATE INDEX IF NOT EXISTS leaderboard_class ON leaderboard_cube (classid)
WHERE playerid NOTNULL
AND league ISNULL
AND formatid ISNULL
AND classid NOTNULL
AND mapid ISNULL
AND grouping = b'01101'::INT;
CREATE INDEX IF NOT EXISTS leaderboard_map ON leaderboard_cube (mapid)
WHERE playerid NOTNULL
AND league ISNULL
AND formatid ISNULL
AND classid ISNULL
AND mapid NOTNULL
AND grouping = b'01110'::INT;

-- When we have multiple filters
CREATE INDEX IF NOT EXISTS leaderboard_bloom ON leaderboard_cube
USING bloom (grouping, mapid, classid, formatid, league)
WITH (col1=1, col2=1, col3=1, col4=1, col5=1)
WHERE playerid NOTNULL;

COMMIT;

ANALYZE VERBOSE leaderboard_cube;
48 changes: 44 additions & 4 deletions trends/schema.sql
@@ -695,6 +695,7 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS leaderboard_cube AS SELECT
formatid,
primary_classid AS classid,
mapid,
grouping(playerid, league, formatid, primary_classid, mapid) AS grouping,
sum(log.duration) AS duration,
sum((wins > losses)::INT) AS wins,
sum((wins = losses)::INT) AS ties,
@@ -712,10 +713,49 @@ GROUP BY CUBE (playerid, league, formatid, classid, mapid)
ORDER BY mapid, classid, formatid, playerid, league
WITH NO DATA;

CREATE UNIQUE INDEX IF NOT EXISTS leaderboard_pkey
ON leaderboard_cube (mapid, classid, formatid, playerid, league);

CREATE INDEX IF NOT EXISTS leaderboard_classid ON leaderboard_cube (classid, formatid);
-- To help out the query planner
CREATE STATISTICS IF NOT EXISTS leaderboard_stats (dependencies, ndistinct, mcv)
ON league, formatid, classid, mapid, grouping
FROM leaderboard_cube;

-- When we have no filters (or nothing better)
CREATE INDEX IF NOT EXISTS leaderboard_grouping ON leaderboard_cube (grouping);

-- When we have a single filter
CREATE INDEX IF NOT EXISTS leaderboard_league ON leaderboard_cube (league)
WHERE playerid NOTNULL
AND league NOTNULL
AND formatid ISNULL
AND classid ISNULL
AND mapid ISNULL
AND grouping = b'00111'::INT;
CREATE INDEX IF NOT EXISTS leaderboard_format ON leaderboard_cube (formatid)
WHERE playerid NOTNULL
AND league ISNULL
AND formatid NOTNULL
AND classid ISNULL
AND mapid ISNULL
AND grouping = b'01011'::INT;
CREATE INDEX IF NOT EXISTS leaderboard_class ON leaderboard_cube (classid)
WHERE playerid NOTNULL
AND league ISNULL
AND formatid ISNULL
AND classid NOTNULL
AND mapid ISNULL
AND grouping = b'01101'::INT;
CREATE INDEX IF NOT EXISTS leaderboard_map ON leaderboard_cube (mapid)
WHERE playerid NOTNULL
AND league ISNULL
AND formatid ISNULL
AND classid ISNULL
AND mapid NOTNULL
AND grouping = b'01110'::INT;

-- When we have multiple filters
CREATE INDEX IF NOT EXISTS leaderboard_bloom ON leaderboard_cube
USING bloom (grouping, mapid, classid, formatid, league)
WITH (col1=1, col2=1, col3=1, col4=1, col5=1)
WHERE playerid NOTNULL;

CREATE TABLE IF NOT EXISTS weapon (
weaponid SERIAL PRIMARY KEY,
17 changes: 11 additions & 6 deletions trends/site/root.py
@@ -74,11 +74,16 @@ def leaderboard():

# Since we are using a cube, we need to explicitly select the NULL rows
cube_clauses = []
for (name, column) in (('class', 'classid'), ('format', 'formatid'), ('map', 'mapid')):
grouping = 0b00000
for (name, column, group) in (
('map', 'mapid', 0b00001),
('class', 'classid', 0b00010),
('format', 'formatid', 0b00100),
('league', 'league', 0b01000),
):
if not filters[name]:
cube_clauses.append("AND {} ISNULL".format(column))
if not filters['league']:
cube_clauses.append("AND league ISNULL")
cube_clauses.append(f"AND {column} ISNULL")
grouping |= group
cube_clauses = '\n'.join(cube_clauses)

order, order_clause = get_order({
@@ -137,7 +142,7 @@ def leaderboard():
sum(dmg) * 1.0 / nullif(sum(dt), 0) AS dr,
sum(hits) * 1.0 / nullif(sum(shots), 0) AS acc
FROM leaderboard_cube
WHERE playerid NOTNULL
WHERE playerid NOTNULL AND grouping = %(grouping)s
{}
{}
GROUP BY playerid
@@ -147,7 +152,7 @@
LEFT JOIN player USING (playerid)
LEFT JOIN name USING (nameid);"""
.format(filter_clauses, cube_clauses, order_clause),
{ **filters, 'limit': limit, 'offset': offset })
{ **filters, 'grouping': grouping, 'limit': limit, 'offset': offset })
resp = flask.make_response(flask.render_template("leaderboard.html",
leaderboard=leaderboard.fetchall()))
resp.cache_control.max_age = 3600