Skip to content

Commit a2717f4

Browse files
committed
Add groupings column to leaderboard_cube
When players have never (for example) played in a league match, the cube groupings with league included will have NULLs in the league column, just like the groupings without league. This will cause these players' stats to be overcounted when not filtering by league. To prevent this, add a groupings column so we can know for sure which rows we should include in the results. In addition to fixing correctness problems, the groupings column also lets us rework our indexes to take advantage of bloom filters. Unlike b-tree indices, which are only efficient when filtering columns in-order, bloom indices have no ordering preference, and filter just as well as long as enough columns are being filtered by. The bloom index implementation in postgres does not support excluding NULLs. However, by including the groupings column in the index we can filter to the correct rows without requiring NULL support. Following the general outline of [1], the entropy in each of the filtering columns is: column entropy ======== ======= league 0.40 formatid 2.20 classid 2.69 mapid 7.07 grouping 3.56 As the information stored in the default signature length of 80 bits is 6.322, we can use one bit for each column (slightly shortchanging the mapid). This gives us a total number of set bits (I) of 80. Using the formula for signature length (s_r) assuming 4K pages and a 4x random read cost, we find the optimal signature length for a given number of filters (Q) is: I Q s_r = = ===== 5 1 809.5 5 2 169.4 5 3 75.8 5 4 46.6 5 5 33.6 This indicates we will support efficient querying with the default signature length of 80 when we are filtering by at least 3 columns. We will always filter by at least one columns (groupings), so the bloom index will be efficient for querying on 2 or more columns. This means we need more-efficient indices for the 1 column case. Fortunately, b-tree indices are a great fit here. In the case where we aren't filtering on any columns, we still want to filter by groupings, so we can use a b-tree index for that as well. This indexing strategy roughly halves the index space, and should be much more robust to arbitrary filter combinations. [1] https://web.archive.org/web/20190201134134/https://blog.coelho.net/database/2016/12/11/postgresql-bloom-index.html Fixes: 44be5a5 ("Optimize leaderboard") Signed-off-by: Sean Anderson <[email protected]>
1 parent 6738e9a commit a2717f4

File tree

7 files changed

+153
-17
lines changed

7 files changed

+153
-17
lines changed

README.adoc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,11 @@ Verify that you can connect to the database by running
3939

4040
$ psql -d trends
4141

42-
You can exit the `psql` shell using the `\q` command.
42+
You can exit the `psql` shell using the `\q` command. Finally, before importing any data, run
43+
44+
$ sudo -u postgres psql -d trends -f trends/bloom.sql
45+
46+
to enable the bloom index extension (which must be done as a superuser).
4347

4448
=== Importing data
4549

test/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33

44
from contextlib import contextmanager
55
import logging
6+
import os
67

78
import pytest
89
from testing.postgresql import Postgresql
910

11+
import trends
1012
from trends.sql import db_connect
1113
from .create import create_test_db
1214

@@ -22,6 +24,10 @@ def database(request):
2224
postgres_args = Postgresql.DEFAULT_SETTINGS['postgres_args']
2325
postgres_args += " -c full_page_writes=off"
2426
with Postgresql(postgres_args=postgres_args) as database:
27+
with db_connect(database.url()) as c:
28+
with open(f"{os.path.dirname(trends.__file__)}/bloom.sql") as bloom:
29+
c.cursor().execute(bloom.read())
30+
2531
with caplog_session(request) as caplog:
2632
with caplog.at_level(logging.ERROR):
2733
create_test_db(database.url())

test/create.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,16 +88,15 @@ def create_test_db(url):
8888
with db_connect(url) as c:
8989
cur = c.cursor()
9090
cur.execute("ANALYZE;")
91-
# A second time to test partitioning log_json
92-
db_init(c)
93-
cur.execute("REFRESH MATERIALIZED VIEW leaderboard_cube;")
94-
cur.execute("REFRESH MATERIALIZED VIEW map_popularity;")
95-
96-
with db_connect(url) as c:
9791
class args:
9892
since = datetime.fromtimestamp(0)
9993
trends.importer.link_demos.link_logs(args, c)
10094
trends.importer.link_matches.link_matches(args, c)
95+
cur.execute("ANALYZE;")
96+
# A second time to test partitioning log_json
97+
db_init(c)
98+
cur.execute("REFRESH MATERIALIZED VIEW leaderboard_cube;")
99+
cur.execute("REFRESH MATERIALIZED VIEW map_popularity;")
101100

102101
if __name__ == '__main__':
103102
if len(sys.argv) != 2:

trends/bloom.sql

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
CREATE EXTENSION IF NOT EXISTS bloom;
2+
3+
DO $$ BEGIN
4+
CREATE OPERATOR CLASS enum_ops DEFAULT FOR TYPE anyenum USING bloom AS
5+
OPERATOR 1 =(anyenum, anyenum),
6+
FUNCTION 1 hashenum(anyenum);
7+
EXCEPTION WHEN duplicate_object THEN
8+
NULL;
9+
END $$;
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
BEGIN;
2+
3+
DROP MATERIALIZED VIEW leaderboard_cube;
4+
CREATE MATERIALIZED VIEW leaderboard_cube AS SELECT
5+
playerid,
6+
league,
7+
formatid,
8+
primary_classid AS classid,
9+
mapid,
10+
grouping(playerid, league, formatid, primary_classid, mapid) AS grouping,
11+
sum(log.duration) AS duration,
12+
sum((wins > losses)::INT) AS wins,
13+
sum((wins = losses)::INT) AS ties,
14+
sum((wins < losses)::INT) AS losses,
15+
sum(kills) AS kills,
16+
sum(deaths) AS deaths,
17+
sum(assists) AS assists,
18+
sum(dmg) AS dmg,
19+
sum(dt) AS dt,
20+
sum(shots) AS shots,
21+
sum(hits) AS hits
22+
FROM log_nodups AS log
23+
JOIN player_stats USING (logid)
24+
GROUP BY CUBE (playerid, league, formatid, classid, mapid)
25+
ORDER BY mapid, classid, formatid, playerid, league;
26+
27+
-- To help out the query planner
28+
CREATE STATISTICS IF NOT EXISTS leaderboard_stats (dependencies, ndistinct, mcv)
29+
ON league, formatid, classid, mapid, grouping
30+
FROM leaderboard_cube;
31+
32+
-- When we have no filters (or nothing better)
33+
CREATE INDEX IF NOT EXISTS leaderboard_grouping ON leaderboard_cube (grouping);
34+
35+
-- When we have a single filter
36+
CREATE INDEX IF NOT EXISTS leaderboard_league ON leaderboard_cube (league)
37+
WHERE playerid NOTNULL
38+
AND league NOTNULL
39+
AND formatid ISNULL
40+
AND classid ISNULL
41+
AND mapid ISNULL
42+
AND grouping = b'01110'::INT;
43+
CREATE INDEX IF NOT EXISTS leaderboard_format ON leaderboard_cube (formatid)
44+
WHERE playerid NOTNULL
45+
AND league ISNULL
46+
AND formatid NOTNULL
47+
AND classid ISNULL
48+
AND mapid ISNULL
49+
AND grouping = b'01110'::INT;
50+
CREATE INDEX IF NOT EXISTS leaderboard_class ON leaderboard_cube (classid)
51+
WHERE playerid NOTNULL
52+
AND league ISNULL
53+
AND formatid ISNULL
54+
AND classid NOTNULL
55+
AND mapid ISNULL
56+
AND grouping = b'01110'::INT;
57+
CREATE INDEX IF NOT EXISTS leaderboard_map ON leaderboard_cube (mapid)
58+
WHERE playerid NOTNULL
59+
AND league ISNULL
60+
AND formatid ISNULL
61+
AND classid ISNULL
62+
AND mapid NOTNULL
63+
AND grouping = b'01110'::INT;
64+
65+
-- When we have multiple filters
66+
CREATE INDEX IF NOT EXISTS leaderboard_bloom ON leaderboard_cube
67+
USING bloom (grouping, mapid, classid, formatid, league)
68+
WITH (col1=1, col2=1, col3=1, col4=1, col5=1)
69+
WHERE playerid NOTNULL;
70+
71+
COMMIT;
72+
73+
ANALYZE VERBOSE leaderboard_cube;

trends/schema.sql

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,7 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS leaderboard_cube AS SELECT
695695
formatid,
696696
primary_classid AS classid,
697697
mapid,
698+
grouping(playerid, league, formatid, primary_classid, mapid) AS grouping,
698699
sum(log.duration) AS duration,
699700
sum((wins > losses)::INT) AS wins,
700701
sum((wins = losses)::INT) AS ties,
@@ -712,10 +713,49 @@ GROUP BY CUBE (playerid, league, formatid, classid, mapid)
712713
ORDER BY mapid, classid, formatid, playerid, league
713714
WITH NO DATA;
714715

715-
CREATE UNIQUE INDEX IF NOT EXISTS leaderboard_pkey
716-
ON leaderboard_cube (mapid, classid, formatid, playerid, league);
717-
718-
CREATE INDEX IF NOT EXISTS leaderboard_classid ON leaderboard_cube (classid, formatid);
716+
-- To help out the query planner
717+
CREATE STATISTICS IF NOT EXISTS leaderboard_stats (dependencies, ndistinct, mcv)
718+
ON league, formatid, classid, mapid, grouping
719+
FROM leaderboard_cube;
720+
721+
-- When we have no filters (or nothing better)
722+
CREATE INDEX IF NOT EXISTS leaderboard_grouping ON leaderboard_cube (grouping);
723+
724+
-- When we have a single filter
725+
CREATE INDEX IF NOT EXISTS leaderboard_league ON leaderboard_cube (league)
726+
WHERE playerid NOTNULL
727+
AND league NOTNULL
728+
AND formatid ISNULL
729+
AND classid ISNULL
730+
AND mapid ISNULL
731+
AND grouping = b'01110'::INT;
732+
CREATE INDEX IF NOT EXISTS leaderboard_format ON leaderboard_cube (formatid)
733+
WHERE playerid NOTNULL
734+
AND league ISNULL
735+
AND formatid NOTNULL
736+
AND classid ISNULL
737+
AND mapid ISNULL
738+
AND grouping = b'01110'::INT;
739+
CREATE INDEX IF NOT EXISTS leaderboard_class ON leaderboard_cube (classid)
740+
WHERE playerid NOTNULL
741+
AND league ISNULL
742+
AND formatid ISNULL
743+
AND classid NOTNULL
744+
AND mapid ISNULL
745+
AND grouping = b'01110'::INT;
746+
CREATE INDEX IF NOT EXISTS leaderboard_map ON leaderboard_cube (mapid)
747+
WHERE playerid NOTNULL
748+
AND league ISNULL
749+
AND formatid ISNULL
750+
AND classid ISNULL
751+
AND mapid NOTNULL
752+
AND grouping = b'01110'::INT;
753+
754+
-- When we have multiple filters
755+
CREATE INDEX IF NOT EXISTS leaderboard_bloom ON leaderboard_cube
756+
USING bloom (grouping, mapid, classid, formatid, league)
757+
WITH (col1=1, col2=1, col3=1, col4=1, col5=1)
758+
WHERE playerid NOTNULL;
719759

720760
CREATE TABLE IF NOT EXISTS weapon (
721761
weaponid SERIAL PRIMARY KEY,

trends/site/root.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,16 @@ def leaderboard():
7474

7575
# Since we are using a cube, we need to explicitly select the NULL rows
7676
cube_clauses = []
77-
for (name, column) in (('class', 'classid'), ('format', 'formatid'), ('map', 'mapid')):
77+
grouping = 0b00000
78+
for (name, column, group) in (
79+
('map', 'mapid', 0b00001),
80+
('class', 'classid', 0b00010),
81+
('format', 'formatid', 0b00100),
82+
('league', 'league', 0b01000),
83+
):
7884
if not filters[name]:
79-
cube_clauses.append("AND {} ISNULL".format(column))
80-
if not filters['league']:
81-
cube_clauses.append("AND league ISNULL")
85+
cube_clauses.append(f"AND {column} ISNULL")
86+
grouping |= group
8287
cube_clauses = '\n'.join(cube_clauses)
8388

8489
order, order_clause = get_order({
@@ -137,7 +142,7 @@ def leaderboard():
137142
sum(dmg) * 1.0 / nullif(sum(dt), 0) AS dr,
138143
sum(hits) * 1.0 / nullif(sum(shots), 0) AS acc
139144
FROM leaderboard_cube
140-
WHERE playerid NOTNULL
145+
WHERE playerid NOTNULL AND grouping = %(grouping)s
141146
{}
142147
{}
143148
GROUP BY playerid
@@ -147,7 +152,7 @@ def leaderboard():
147152
LEFT JOIN player USING (playerid)
148153
LEFT JOIN name USING (nameid);"""
149154
.format(filter_clauses, cube_clauses, order_clause),
150-
{ **filters, 'limit': limit, 'offset': offset })
155+
{ **filters, 'grouping': grouping, 'limit': limit, 'offset': offset })
151156
resp = flask.make_response(flask.render_template("leaderboard.html",
152157
leaderboard=leaderboard.fetchall()))
153158
resp.cache_control.max_age = 3600

0 commit comments

Comments
 (0)