From a2717f44d919d1d3e3e41c865c890d2e5b92ae85 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 22 Feb 2024 21:23:17 -0500 Subject: [PATCH] Add groupings column to leaderboard_cube When players have never (for example) played in a league match, the cube groupings with league included will have NULLs in the league column, just like the groupings without league. This will cause these players' stats to be overcounted when not filtering by league. To prevent this, add a groupings column so we can know for sure which rows we should include in the results. In addition to fixing correctness problems, the groupings column also lets us rework our indexes to take advantage of bloom filters. Unlike b-tree indices, which are only efficient when filtering columns in-order, bloom indices have no ordering preference, and filter just as well as long as enough columns are being filtered by. The bloom index implementation in postgres does not support excluding NULLs. However, by including the groupings column in the index we can filter to the correct rows without requiring NULL support. Following the general outline of [1], the entropy in each of the filtering columns is: column entropy ======== ======= league 0.40 formatid 2.20 classid 2.69 mapid 7.07 grouping 3.56 As the information stored in the default signature length of 80 bits is 6.322, we can use one bit for each column (slightly shortchanging the mapid). This gives us a total number of set bits (I) of 5. Using the formula for signature length (s_r) assuming 4K pages and a 4x random read cost, we find the optimal signature length for a given number of filters (Q) is: I Q s_r = = ===== 5 1 809.5 5 2 169.4 5 3 75.8 5 4 46.6 5 5 33.6 This indicates we will support efficient querying with the default signature length of 80 when we are filtering by at least 3 columns. We will always filter by at least one column (groupings), so the bloom index will be efficient for querying on 2 or more columns. 
This means we need more-efficient indices for the 1 column case. Fortunately, b-tree indices are a great fit here. In the case where we aren't filtering on any columns, we still want to filter by groupings, so we can use a b-tree index for that as well. This indexing strategy roughly halves the index space, and should be much more robust to arbitrary filter combinations. [1] https://web.archive.org/web/20190201134134/https://blog.coelho.net/database/2016/12/11/postgresql-bloom-index.html Fixes: 44be5a5 ("Optimize leaderboard") Signed-off-by: Sean Anderson --- README.adoc | 6 +- test/conftest.py | 6 ++ test/create.py | 11 ++-- trends/bloom.sql | 9 +++ trends/migrations/leaderboard_bloom.sql | 73 +++++++++++++++++++++++++ trends/schema.sql | 48 ++++++++++++++-- trends/site/root.py | 17 ++++-- 7 files changed, 153 insertions(+), 17 deletions(-) create mode 100644 trends/bloom.sql create mode 100644 trends/migrations/leaderboard_bloom.sql diff --git a/README.adoc b/README.adoc index 9887cfa..9b51c06 100644 --- a/README.adoc +++ b/README.adoc @@ -39,7 +39,11 @@ Verify that you can connect to the database by running $ psql -d trends -You can exit the `psql` shell using the `\q` command. +You can exit the `psql` shell using the `\q` command. Finally, before importing any data, run + + $ sudo -u postgres psql -d trends -f trends/bloom.sql + +to enable the bloom index extension (which must be done as a superuser). 
=== Importing data diff --git a/test/conftest.py b/test/conftest.py index cfafecb..24f6112 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,10 +3,12 @@ from contextlib import contextmanager import logging +import os import pytest from testing.postgresql import Postgresql +import trends from trends.sql import db_connect from .create import create_test_db @@ -22,6 +24,10 @@ def database(request): postgres_args = Postgresql.DEFAULT_SETTINGS['postgres_args'] postgres_args += " -c full_page_writes=off" with Postgresql(postgres_args=postgres_args) as database: + with db_connect(database.url()) as c: + with open(f"{os.path.dirname(trends.__file__)}/bloom.sql") as bloom: + c.cursor().execute(bloom.read()) + with caplog_session(request) as caplog: with caplog.at_level(logging.ERROR): create_test_db(database.url()) diff --git a/test/create.py b/test/create.py index 8357d57..3ceacda 100755 --- a/test/create.py +++ b/test/create.py @@ -88,16 +88,15 @@ def create_test_db(url): with db_connect(url) as c: cur = c.cursor() cur.execute("ANALYZE;") - # A second time to test partitioning log_json - db_init(c) - cur.execute("REFRESH MATERIALIZED VIEW leaderboard_cube;") - cur.execute("REFRESH MATERIALIZED VIEW map_popularity;") - - with db_connect(url) as c: class args: since = datetime.fromtimestamp(0) trends.importer.link_demos.link_logs(args, c) trends.importer.link_matches.link_matches(args, c) + cur.execute("ANALYZE;") + # A second time to test partitioning log_json + db_init(c) + cur.execute("REFRESH MATERIALIZED VIEW leaderboard_cube;") + cur.execute("REFRESH MATERIALIZED VIEW map_popularity;") if __name__ == '__main__': if len(sys.argv) != 2: diff --git a/trends/bloom.sql b/trends/bloom.sql new file mode 100644 index 0000000..1fce69a --- /dev/null +++ b/trends/bloom.sql @@ -0,0 +1,9 @@ +CREATE EXTENSION IF NOT EXISTS bloom; + +DO $$ BEGIN + CREATE OPERATOR CLASS enum_ops DEFAULT FOR TYPE anyenum USING bloom AS + OPERATOR 1 =(anyenum, anyenum), + FUNCTION 1 
hashenum(anyenum); +EXCEPTION WHEN duplicate_object THEN + NULL; +END $$; diff --git a/trends/migrations/leaderboard_bloom.sql b/trends/migrations/leaderboard_bloom.sql new file mode 100644 index 0000000..34186e0 --- /dev/null +++ b/trends/migrations/leaderboard_bloom.sql @@ -0,0 +1,73 @@ +BEGIN; + +DROP MATERIALIZED VIEW leaderboard_cube; +CREATE MATERIALIZED VIEW leaderboard_cube AS SELECT + playerid, + league, + formatid, + primary_classid AS classid, + mapid, + grouping(playerid, league, formatid, primary_classid, mapid) AS grouping, + sum(log.duration) AS duration, + sum((wins > losses)::INT) AS wins, + sum((wins = losses)::INT) AS ties, + sum((wins < losses)::INT) AS losses, + sum(kills) AS kills, + sum(deaths) AS deaths, + sum(assists) AS assists, + sum(dmg) AS dmg, + sum(dt) AS dt, + sum(shots) AS shots, + sum(hits) AS hits +FROM log_nodups AS log +JOIN player_stats USING (logid) +GROUP BY CUBE (playerid, league, formatid, classid, mapid) +ORDER BY mapid, classid, formatid, playerid, league; + +-- To help out the query planner +CREATE STATISTICS IF NOT EXISTS leaderboard_stats (dependencies, ndistinct, mcv) + ON league, formatid, classid, mapid, grouping + FROM leaderboard_cube; + +-- When we have no filters (or nothing better) +CREATE INDEX IF NOT EXISTS leaderboard_grouping ON leaderboard_cube (grouping); + +-- When we have a single filter +CREATE INDEX IF NOT EXISTS leaderboard_league ON leaderboard_cube (league) + WHERE playerid NOTNULL + AND league NOTNULL + AND formatid ISNULL + AND classid ISNULL + AND mapid ISNULL + AND grouping = b'00111'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_format ON leaderboard_cube (formatid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid NOTNULL + AND classid ISNULL + AND mapid ISNULL + AND grouping = b'01011'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_class ON leaderboard_cube (classid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid ISNULL + AND classid NOTNULL + AND mapid ISNULL + AND 
grouping = b'01101'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_map ON leaderboard_cube (mapid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid ISNULL + AND classid ISNULL + AND mapid NOTNULL + AND grouping = b'01110'::INT; + +-- When we have multiple filters +CREATE INDEX IF NOT EXISTS leaderboard_bloom ON leaderboard_cube + USING bloom (grouping, mapid, classid, formatid, league) + WITH (col1=1, col2=1, col3=1, col4=1, col5=1) + WHERE playerid NOTNULL; + +COMMIT; + +ANALYZE VERBOSE leaderboard_cube; diff --git a/trends/schema.sql b/trends/schema.sql index edfcbc2..fd1a17b 100644 --- a/trends/schema.sql +++ b/trends/schema.sql @@ -695,6 +695,7 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS leaderboard_cube AS SELECT formatid, primary_classid AS classid, mapid, + grouping(playerid, league, formatid, primary_classid, mapid) AS grouping, sum(log.duration) AS duration, sum((wins > losses)::INT) AS wins, sum((wins = losses)::INT) AS ties, @@ -712,10 +713,49 @@ GROUP BY CUBE (playerid, league, formatid, classid, mapid) ORDER BY mapid, classid, formatid, playerid, league WITH NO DATA; -CREATE UNIQUE INDEX IF NOT EXISTS leaderboard_pkey - ON leaderboard_cube (mapid, classid, formatid, playerid, league); - -CREATE INDEX IF NOT EXISTS leaderboard_classid ON leaderboard_cube (classid, formatid); +-- To help out the query planner +CREATE STATISTICS IF NOT EXISTS leaderboard_stats (dependencies, ndistinct, mcv) + ON league, formatid, classid, mapid, grouping + FROM leaderboard_cube; + +-- When we have no filters (or nothing better) +CREATE INDEX IF NOT EXISTS leaderboard_grouping ON leaderboard_cube (grouping); + +-- When we have a single filter +CREATE INDEX IF NOT EXISTS leaderboard_league ON leaderboard_cube (league) + WHERE playerid NOTNULL + AND league NOTNULL + AND formatid ISNULL + AND classid ISNULL + AND mapid ISNULL + AND grouping = b'00111'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_format ON leaderboard_cube (formatid) + WHERE playerid NOTNULL + AND 
league ISNULL + AND formatid NOTNULL + AND classid ISNULL + AND mapid ISNULL + AND grouping = b'01011'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_class ON leaderboard_cube (classid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid ISNULL + AND classid NOTNULL + AND mapid ISNULL + AND grouping = b'01101'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_map ON leaderboard_cube (mapid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid ISNULL + AND classid ISNULL + AND mapid NOTNULL + AND grouping = b'01110'::INT; + +-- When we have multiple filters +CREATE INDEX IF NOT EXISTS leaderboard_bloom ON leaderboard_cube + USING bloom (grouping, mapid, classid, formatid, league) + WITH (col1=1, col2=1, col3=1, col4=1, col5=1) + WHERE playerid NOTNULL; CREATE TABLE IF NOT EXISTS weapon ( weaponid SERIAL PRIMARY KEY, diff --git a/trends/site/root.py b/trends/site/root.py index 85299dc..b131e96 100644 --- a/trends/site/root.py +++ b/trends/site/root.py @@ -74,11 +74,16 @@ def leaderboard(): # Since we are using a cube, we need to explicitly select the NULL rows cube_clauses = [] - for (name, column) in (('class', 'classid'), ('format', 'formatid'), ('map', 'mapid')): + grouping = 0b00000 + for (name, column, group) in ( + ('map', 'mapid', 0b00001), + ('class', 'classid', 0b00010), + ('format', 'formatid', 0b00100), + ('league', 'league', 0b01000), + ): if not filters[name]: - cube_clauses.append("AND {} ISNULL".format(column)) - if not filters['league']: - cube_clauses.append("AND league ISNULL") + cube_clauses.append(f"AND {column} ISNULL") + grouping |= group cube_clauses = '\n'.join(cube_clauses) order, order_clause = get_order({
JOIN name USING (nameid);""" .format(filter_clauses, cube_clauses, order_clause), - { **filters, 'limit': limit, 'offset': offset }) + { **filters, 'grouping': grouping, 'limit': limit, 'offset': offset }) resp = flask.make_response(flask.render_template("leaderboard.html", leaderboard=leaderboard.fetchall())) resp.cache_control.max_age = 3600