From a2717f44d919d1d3e3e41c865c890d2e5b92ae85 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 22 Feb 2024 21:23:17 -0500 Subject: [PATCH] Add groupings column to leaderboard_cube When players have never (for example) played in a league match, the cube groupings with league included will have NULLs in the league column, just like the groupings without league. This will cause these players' stats to be overcounted when not filtering by league. To prevent this, add a groupings column so we can know for sure which rows we should include in the results. In addition to fixing correctness problems, the groupings column also lets us rework our indexes to take advantage of bloom filters. Unlike b-tree indices, which are only efficient when filtering columns in-order, bloom indices have no ordering preference, and filter just as well as long as enough columns are being filtered by. The bloom index implementation in postgres does not support excluding NULLs. However, by including the groupings column in the index we can filter to the correct rows without requiring NULL support. Following the general outline of [1], the entropy in each of the filtering columns is: column entropy ======== ======= league 0.40 formatid 2.20 classid 2.69 mapid 7.07 grouping 3.56 As the information stored in the default signature length of 80 bits is 6.322, we can use one bit for each column (slightly shortchanging the mapid). This gives us a total number of set bits (I) of 5. Using the formula for signature length (s_r) assuming 4K pages and a 4x random read cost, we find the optimal signature length for a given number of filters (Q) is: I Q s_r = = ===== 5 1 809.5 5 2 169.4 5 3 75.8 5 4 46.6 5 5 33.6 This indicates we will support efficient querying with the default signature length of 80 when we are filtering by at least 3 columns. We will always filter by at least one column (groupings), so the bloom index will be efficient for querying on 2 or more columns. 
This means we need more-efficient indices for the 1 column case. Fortunately, b-tree indices are a great fit here. In the case where we aren't filtering on any columns, we still want to filter by groupings, so we can use a b-tree index for that as well. This indexing strategy roughly halves the index space, and should be much more robust to arbitrary filter combinations. [1] https://web.archive.org/web/20190201134134/https://blog.coelho.net/database/2016/12/11/postgresql-bloom-index.html Fixes: 44be5a5 ("Optimize leaderboard") Signed-off-by: Sean Anderson --- README.adoc | 6 +- test/conftest.py | 6 ++ test/create.py | 11 ++-- trends/bloom.sql | 9 +++ trends/migrations/leaderboard_bloom.sql | 73 +++++++++++++++++++++++++ trends/schema.sql | 48 ++++++++++++++-- trends/site/root.py | 17 ++++-- 7 files changed, 153 insertions(+), 17 deletions(-) create mode 100644 trends/bloom.sql create mode 100644 trends/migrations/leaderboard_bloom.sql diff --git a/README.adoc b/README.adoc index 9887cfa..9b51c06 100644 --- a/README.adoc +++ b/README.adoc @@ -39,7 +39,11 @@ Verify that you can connect to the database by running $ psql -d trends -You can exit the `psql` shell using the `\q` command. +You can exit the `psql` shell using the `\q` command. Finally, before importing any data, run + + $ sudo -u postgres psql -d trends -f trends/bloom.sql + +to enable the bloom index extension (which must be done as a superuser). 
=== Importing data diff --git a/test/conftest.py b/test/conftest.py index cfafecb..24f6112 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,10 +3,12 @@ from contextlib import contextmanager import logging +import os import pytest from testing.postgresql import Postgresql +import trends from trends.sql import db_connect from .create import create_test_db @@ -22,6 +24,10 @@ def database(request): postgres_args = Postgresql.DEFAULT_SETTINGS['postgres_args'] postgres_args += " -c full_page_writes=off" with Postgresql(postgres_args=postgres_args) as database: + with db_connect(database.url()) as c: + with open(f"{os.path.dirname(trends.__file__)}/bloom.sql") as bloom: + c.cursor().execute(bloom.read()) + with caplog_session(request) as caplog: with caplog.at_level(logging.ERROR): create_test_db(database.url()) diff --git a/test/create.py b/test/create.py index 8357d57..3ceacda 100755 --- a/test/create.py +++ b/test/create.py @@ -88,16 +88,15 @@ def create_test_db(url): with db_connect(url) as c: cur = c.cursor() cur.execute("ANALYZE;") - # A second time to test partitioning log_json - db_init(c) - cur.execute("REFRESH MATERIALIZED VIEW leaderboard_cube;") - cur.execute("REFRESH MATERIALIZED VIEW map_popularity;") - - with db_connect(url) as c: class args: since = datetime.fromtimestamp(0) trends.importer.link_demos.link_logs(args, c) trends.importer.link_matches.link_matches(args, c) + cur.execute("ANALYZE;") + # A second time to test partitioning log_json + db_init(c) + cur.execute("REFRESH MATERIALIZED VIEW leaderboard_cube;") + cur.execute("REFRESH MATERIALIZED VIEW map_popularity;") if __name__ == '__main__': if len(sys.argv) != 2: diff --git a/trends/bloom.sql b/trends/bloom.sql new file mode 100644 index 0000000..1fce69a --- /dev/null +++ b/trends/bloom.sql @@ -0,0 +1,9 @@ +CREATE EXTENSION IF NOT EXISTS bloom; + +DO $$ BEGIN + CREATE OPERATOR CLASS enum_ops DEFAULT FOR TYPE anyenum USING bloom AS + OPERATOR 1 =(anyenum, anyenum), + FUNCTION 1 
hashenum(anyenum); +EXCEPTION WHEN duplicate_object THEN + NULL; +END $$; diff --git a/trends/migrations/leaderboard_bloom.sql b/trends/migrations/leaderboard_bloom.sql new file mode 100644 index 0000000..34186e0 --- /dev/null +++ b/trends/migrations/leaderboard_bloom.sql @@ -0,0 +1,73 @@ +BEGIN; + +DROP MATERIALIZED VIEW leaderboard_cube; +CREATE MATERIALIZED VIEW leaderboard_cube AS SELECT + playerid, + league, + formatid, + primary_classid AS classid, + mapid, + grouping(playerid, league, formatid, primary_classid, mapid) AS grouping, + sum(log.duration) AS duration, + sum((wins > losses)::INT) AS wins, + sum((wins = losses)::INT) AS ties, + sum((wins < losses)::INT) AS losses, + sum(kills) AS kills, + sum(deaths) AS deaths, + sum(assists) AS assists, + sum(dmg) AS dmg, + sum(dt) AS dt, + sum(shots) AS shots, + sum(hits) AS hits +FROM log_nodups AS log +JOIN player_stats USING (logid) +GROUP BY CUBE (playerid, league, formatid, classid, mapid) +ORDER BY mapid, classid, formatid, playerid, league; + +-- To help out the query planner +CREATE STATISTICS IF NOT EXISTS leaderboard_stats (dependencies, ndistinct, mcv) + ON league, formatid, classid, mapid, grouping + FROM leaderboard_cube; + +-- When we have no filters (or nothing better) +CREATE INDEX IF NOT EXISTS leaderboard_grouping ON leaderboard_cube (grouping); + +-- When we have a single filter +CREATE INDEX IF NOT EXISTS leaderboard_league ON leaderboard_cube (league) + WHERE playerid NOTNULL + AND league NOTNULL + AND formatid ISNULL + AND classid ISNULL + AND mapid ISNULL + AND grouping = b'00111'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_format ON leaderboard_cube (formatid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid NOTNULL + AND classid ISNULL + AND mapid ISNULL + AND grouping = b'01011'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_class ON leaderboard_cube (classid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid ISNULL + AND classid NOTNULL + AND mapid ISNULL + AND 
grouping = b'01101'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_map ON leaderboard_cube (mapid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid ISNULL + AND classid ISNULL + AND mapid NOTNULL + AND grouping = b'01110'::INT; + +-- When we have multiple filters +CREATE INDEX IF NOT EXISTS leaderboard_bloom ON leaderboard_cube + USING bloom (grouping, mapid, classid, formatid, league) + WITH (col1=1, col2=1, col3=1, col4=1, col5=1) + WHERE playerid NOTNULL; + +COMMIT; + +ANALYZE VERBOSE leaderboard_cube; diff --git a/trends/schema.sql b/trends/schema.sql index edfcbc2..fd1a17b 100644 --- a/trends/schema.sql +++ b/trends/schema.sql @@ -695,6 +695,7 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS leaderboard_cube AS SELECT formatid, primary_classid AS classid, mapid, + grouping(playerid, league, formatid, primary_classid, mapid) AS grouping, sum(log.duration) AS duration, sum((wins > losses)::INT) AS wins, sum((wins = losses)::INT) AS ties, @@ -712,10 +713,49 @@ GROUP BY CUBE (playerid, league, formatid, classid, mapid) ORDER BY mapid, classid, formatid, playerid, league WITH NO DATA; -CREATE UNIQUE INDEX IF NOT EXISTS leaderboard_pkey - ON leaderboard_cube (mapid, classid, formatid, playerid, league); - -CREATE INDEX IF NOT EXISTS leaderboard_classid ON leaderboard_cube (classid, formatid); +-- To help out the query planner +CREATE STATISTICS IF NOT EXISTS leaderboard_stats (dependencies, ndistinct, mcv) + ON league, formatid, classid, mapid, grouping + FROM leaderboard_cube; + +-- When we have no filters (or nothing better) +CREATE INDEX IF NOT EXISTS leaderboard_grouping ON leaderboard_cube (grouping); + +-- When we have a single filter +CREATE INDEX IF NOT EXISTS leaderboard_league ON leaderboard_cube (league) + WHERE playerid NOTNULL + AND league NOTNULL + AND formatid ISNULL + AND classid ISNULL + AND mapid ISNULL + AND grouping = b'00111'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_format ON leaderboard_cube (formatid) + WHERE playerid NOTNULL + AND 
league ISNULL + AND formatid NOTNULL + AND classid ISNULL + AND mapid ISNULL + AND grouping = b'01011'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_class ON leaderboard_cube (classid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid ISNULL + AND classid NOTNULL + AND mapid ISNULL + AND grouping = b'01101'::INT; +CREATE INDEX IF NOT EXISTS leaderboard_map ON leaderboard_cube (mapid) + WHERE playerid NOTNULL + AND league ISNULL + AND formatid ISNULL + AND classid ISNULL + AND mapid NOTNULL + AND grouping = b'01110'::INT; + +-- When we have multiple filters +CREATE INDEX IF NOT EXISTS leaderboard_bloom ON leaderboard_cube + USING bloom (grouping, mapid, classid, formatid, league) + WITH (col1=1, col2=1, col3=1, col4=1, col5=1) + WHERE playerid NOTNULL; CREATE TABLE IF NOT EXISTS weapon ( weaponid SERIAL PRIMARY KEY, diff --git a/trends/site/root.py b/trends/site/root.py index 85299dc..b131e96 100644 --- a/trends/site/root.py +++ b/trends/site/root.py @@ -74,11 +74,16 @@ def leaderboard(): # Since we are using a cube, we need to explicitly select the NULL rows cube_clauses = [] - for (name, column) in (('class', 'classid'), ('format', 'formatid'), ('map', 'mapid')): + grouping = 0b00000 + for (name, column, group) in ( + ('map', 'mapid', 0b00001), + ('class', 'classid', 0b00010), + ('format', 'formatid', 0b00100), + ('league', 'league', 0b01000), + ): if not filters[name]: - cube_clauses.append("AND {} ISNULL".format(column)) - if not filters['league']: - cube_clauses.append("AND league ISNULL") + cube_clauses.append(f"AND {column} ISNULL") + grouping |= group cube_clauses = '\n'.join(cube_clauses) order, order_clause = get_order({
JOIN name USING (nameid);""" .format(filter_clauses, cube_clauses, order_clause), - { **filters, 'limit': limit, 'offset': offset }) + { **filters, 'grouping': grouping, 'limit': limit, 'offset': offset }) resp = flask.make_response(flask.render_template("leaderboard.html", leaderboard=leaderboard.fetchall())) resp.cache_control.max_age = 3600