#!/bin/env python3
"""
Run all programs required to reproduce the thesis' results
Copyright (c) 2024 Christoph Ullinger <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from pathlib import Path
from typing import Callable, Iterator, Optional, Sequence, Any, TypedDict
from dataclasses import dataclass, field, is_dataclass, \
    asdict as dataclass_asdict
from time import time_ns, sleep
from enum import Enum
from statistics import mean, stdev, quantiles, median
import argparse
from shlex import quote
import json
import logging
import subprocess
import os
import re
import csv
from zipfile import ZipFile
PROGRAM_DESCRIPTION = """
\"Efficient Spatial Search for the QLever SPARQL engine\"
This program runs all steps required to reproduce the thesis' evaluation and
case study results.
"""
PROGRAM_DIR = Path(__file__).parent.resolve()
SUBPROCESS_TIMEOUT = 24 * 60 * 60  # 24 h
VERBOSE = False
REQUIRED_ON_PATH = [
    "bash", "grep", "curl", "tar", "gunzip", "pwd", "mkdir", "df", "cat",
    "bzcat", "tail", "tee", "git", "time", "wc", "find", "rm", "chmod",
    "podman", "qlever"
]
SHELL = "/bin/bash"

# Bytes per Mebi/Gibibyte shorthands
MIB = 2 ** 20
GIB = 2 ** 30
FREE_SPACE_OUTPUT = 1024 * GIB
FREE_SPACE_CONTAINERS = 500 * GIB
FREE_MEMORY = 100 * GIB

# Nanoseconds per millisecond shorthand
NS_MS = 10 ** 6

# Settings to bypass some steps
BYPASS_MEMORY_CHECK = False
BYPASS_DISK_CHECK = False
BYPASS_CONTAINER_BUILDS = False
BYPASS_GET_DATA: list[str] = []
BYPASS_INDEX_BUILD: list[str] = []
BYPASS_QUERY: list[str] = []
BYPASS_CASE_STUDY_QUERY = False

QLEVER_IMAGE = "docker.io/adfreiburg/qlever"
QLEVER_NO_GEOPOINT_HASH = "6384041460e62e6088fc86b5c5190e51cd986372"
QLEVER_NEW_HASH = "01d83064a97fa51925f7be3cc89a02acf2a20c02"
QLEVER_INSTANCES: dict[str, int] = {
    # directory/qleverfile name => port
    "osm-de": 7925,
    "election": 7926,
    "gtfs": 7928,
    "gtfs-no-geopoint": 7931
}
QLEVER_INDEX_FILES_GLOB = [
    "*.index.*", "*-log.txt", "*.meta-data.json",
    "*.settings.json", "*.vocabulary.*"
]
QLEVER_CONTROL_URL = "https://github.com/ad-freiburg/qlever-control.git"
QLEVER_ONLY_BASELINE_NOT_CARTESIAN = True
OSM2RDF_REPO = "https://github.com/ad-freiburg/osm2rdf.git"
OSM2RDF_COMMIT_HASH = "cf015f76bb9d257fbfa5eefbd31b828fbe75baaa"
OSM_GERMANY_PBF = "https://download.geofabrik.de/europe/germany-latest.osm.pbf"
OUTPUT_SUBDIRS: list[Path] = [Path(p) for p in QLEVER_INSTANCES.keys()] + \
    [Path("results"), Path("log")]

# Example OpenStreetMap tags for filtering subsets of OSM data of different
# sizes for evaluation steps
OsmTestTag = tuple[str, str, int]  # Tag key, Value, Approx. Size (OSM DE)
OSM_TEST_TAGS: list[OsmTestTag] = [
    ("leisure", "sauna", 1_500),
    ("railway", "station", 4_500),
    ("tourism", "viewpoint", 29_500),
    ("shop", "supermarket", 34_000),
    ("amenity", "restaurant", 102_000),
    ("amenity", "bench", 707_000),
    ("building", "*", 37_000_000)
]

# Example GTFS feeds and approximate number of gtfs:Stop entities
GTFS_TEST_FEEDS: dict[str, int] = {
    "delfi_20210924": 480_000,
    "delfi_20240603": 540_000,
    "vag_2024": 1_000,
    "fintraffic_2024": 1_500
}

# Maximum number of point combinations for which we can reasonably still
# calculate a cartesian product
CARTESIAN_LIMIT = 250_000_000

# How often to run evaluation queries for more reliable execution times
QUERY_ITERATIONS = 10

# If an individual query in a benchmark takes longer than this amount of
# milliseconds, skip further iterations of the same query to reduce the
# overall running time of the evaluation -> this means that a single
# benchmark may in the worst case take approximately
# CANCEL_ITERATIONS_IF_LONGER_THAN * QUERY_ITERATIONS time (= ca. 1.7 h)
CANCEL_ITERATIONS_IF_LONGER_THAN = 10 * 60 * 1_000  # 10 min

# Sizes and filenames for checking output in "get_data" steps
ExpectedSize = tuple[str, int, int]  # Filename, Min, Max
ELECTION_EXPECTED_SIZE: list[ExpectedSize] = [
    (f, MIB // 2, 5 * MIB)
    for f in ("btw21.ttl.bz2", "btw21-aux-geo.tsv", "ew24.ttl.bz2")
]
OSM_DE_EXPECTED_SIZE: ExpectedSize = (
    "osm-germany.ttl.bz2", 50 * GIB, 200 * GIB)
GTFS_EXPECTED_SIZE: list[ExpectedSize] = [
    ("delfi21.ttl.bz2", 500 * MIB, 10 * GIB),
    ("delfi24.ttl.bz2", 500 * MIB, 10 * GIB),
    ("vag24.ttl.bz2", 10 * MIB, 100 * MIB),
    ("fintraffic24.ttl.bz2", 20 * MIB, 500 * MIB)
]
PG_EXPORT_WKT_SIZE: ExpectedSize = ("geometries.tsv", 10 * GIB, 50 * GIB)
PG_IMPORT_SPATIAL_REL_SIZE: ExpectedSize = (
    "geometries_spatialjoin.csv", 50 * GIB, 150 * GIB)
OSM_GERMANY_PBF_SIZE: ExpectedSize = (
    "osm-germany.pbf", 2 * GIB, 10 * GIB
)

# Filenames for extraction of zipped reproduction.py
EXTRACT_PROG_TMP_NAME = "temp_reproduction_code.zip"
EXTRACT_PROG_DIR = "program_dir"

# Allows skipping PostgreSQL queries that most likely won't finish in
# 24 hours, based on experience from running them previously
POSTGRES_SKIP_24H_QUERIES = True
POSTGRES_24H_QUERIES: list[tuple[str, str, str, str]] = [
    ("building", "*", "railway", "station"),
    ("building", "*", "tourism", "viewpoint"),
]

# Don't apply CARTESIAN_LIMIT to GTFS baseline max distance search
GTFS_MAX_DIST_CARTESIAN_ALL = False

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)
class AnsiEsc(Enum):
    "Colors for fancy terminal output"
    RED = '\x1b[31m'
    GREEN = '\x1b[32m'
    YELLOW = '\x1b[33m'
    BLUE = '\x1b[34m'
    PINK = '\x1b[35m'
    NORMAL = '\x1b[0m'
    BOLD = '\x1b[1m'
    REVERSED = '\x1b[7m'


def _str(args: tuple[Any, ...]) -> str:
    "Helper to prepare arguments for logging"
    return " ".join(str(a) for a in args)


def log_error(*args):
    logger.error(AnsiEsc.RED.value + _str(args) + AnsiEsc.NORMAL.value)


def log_warning(*args):
    logger.warning(AnsiEsc.YELLOW.value + _str(args) + AnsiEsc.NORMAL.value)


def log_success(*args):
    logger.info(AnsiEsc.GREEN.value + _str(args) + AnsiEsc.NORMAL.value)


def log_command(*args):
    logger.info(AnsiEsc.BLUE.value + _str(args) + AnsiEsc.NORMAL.value)


def log_info(*args):
    logger.info(_str(args))


def log_important(*args):
    logger.info(AnsiEsc.BOLD.value + AnsiEsc.REVERSED.value +
                AnsiEsc.BLUE.value + _str(args) + AnsiEsc.NORMAL.value)

class EnhancedJSONEncoder(json.JSONEncoder):
    """
    Allows encoding data containing dataclasses and pathlib.Path objects.
    Adapted from <https://stackoverflow.com/a/51286749>.
    """

    def default(self, o: Any) -> Any:
        if is_dataclass(o) and not isinstance(o, type):
            return dataclass_asdict(o)
        if isinstance(o, Path):
            return str(o)
        return super().default(o)

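# Illustrative usage (not part of the original script): the encoder is meant
# to be passed via the standard `cls` parameter of the json module, e.g.
#   json.dumps(step_result, cls=EnhancedJSONEncoder)
# which serializes nested dataclasses as dicts and Path objects as strings,
# both of which the default encoder would reject.
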
@dataclass
class InternalStep:
    success: bool = False
    stdout: str = ""
    stderr: str = ""
    identifier: Optional[str] = None

    @property
    def returncode(self):
        return 0 if self.success else 1


@dataclass
class SimpleSubproc:
    command: str
    cwd: Optional[Path]
    log_path: Optional[Path]
    returncode: int
    stdout: str
    stderr: str
    start: int
    end: int
    identifier: Optional[str] = None

    @staticmethod
    def run(command: str, cwd: Optional[Path] = None,
            fwd_output: bool | str | Path = False) -> 'SimpleSubproc':
        log_command(command)
        log_path: Optional[Path] = None
        match fwd_output:
            case str() | Path():
                log_path = Path(fwd_output).resolve()
                log_info("The command's output is being piped to a file. " +
                         f"Use 'tail -f {log_path}' to observe the log.")
                with open(log_path, "a") as log_file:
                    start = time_ns()
                    proc = subprocess.run(command, shell=True,
                                          executable=SHELL, cwd=cwd,
                                          timeout=SUBPROCESS_TIMEOUT,
                                          stderr=subprocess.STDOUT,
                                          stdout=log_file)
                    end = time_ns()
            case bool():
                start = time_ns()
                proc = subprocess.run(command, shell=True,
                                      executable=SHELL,
                                      cwd=cwd, timeout=SUBPROCESS_TIMEOUT,
                                      capture_output=(not fwd_output))
                end = time_ns()
        if VERBOSE:
            log_info("Command took",
                     round((end - start) / NS_MS), "ms")
        return SimpleSubproc(
            command,
            cwd,
            log_path,
            proc.returncode,
            "" if fwd_output else proc.stdout.decode("utf-8"),
            "" if fwd_output else proc.stderr.decode("utf-8"),
            start, end)

    @property
    def success(self) -> bool:
        return self.returncode == 0

    @property
    def time(self) -> float:
        "Running time in milliseconds"
        # start/end are time_ns() timestamps, so divide by NS_MS (10 ** 6)
        # to obtain milliseconds as the docstring promises
        return (self.end - self.start) / NS_MS

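# Plumbing types for the step runner: every step is a generator yielding
# SimpleSubproc or InternalStep sub-results; these are grouped into
# StepResult objects and collected per step name in a Results dict.
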
SubResult = SimpleSubproc | InternalStep
@dataclass
class StepResult:
    subresults: list[SubResult] = field(
        default_factory=list)

    @property
    def success(self) -> bool:
        return all(i.success for i in self.subresults)

    def add(self, res: SubResult):
        self.subresults.append(res)

Results = dict[str, list[StepResult]]
SubResGen = Iterator[SubResult]
Step = Callable[[Results, Path], SubResGen]
Steps = list[Step]
IndexSizes = TypedDict("IndexSizes", {
    "postgres": dict[str, int],
    "qlever": dict[str, int]
})
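# Note: how IndexSizes is filled is not visible in this part of the script;
# presumably it maps instance names to index sizes, once per system
# ("postgres" and "qlever").
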
###############################################################################
def dep(prev: Results, *args: Step):
    "Helper: Assert that steps have already been run successfully"
    for step in args:
        name = step.__name__
        assert name in prev, f"Step requires '{name}' to be run first"
        for p in prev[name]:
            assert p.success, \
                f"This step requires the step '{name}' to be successful, " + \
                "but it failed"

def check_dependencies(_: Results, output: Path) -> SubResGen:
    "Check for required programs"
    qmsg = " You can find install instructions at " + QLEVER_CONTROL_URL + "."
    for prog in REQUIRED_ON_PATH:
        p = SimpleSubproc.run(f"which '{prog}'", output)
        yield p
        assert p.success, \
            f"Required program '{prog}' not found on your $PATH. " + \
            "Please install it to proceed." + \
            (qmsg if prog == "qlever" else "")
        log_success(f"Found {p.stdout.rstrip()}")

def check_disk_free(prev: Results, output: Path) -> SubResGen:
    "Check for system requirements: disk space"
    dep(prev, check_dependencies)
    paths = [(output, FREE_SPACE_OUTPUT), (os.environ.get(
        "XDG_DATA_HOME") or os.environ.get("HOME"), FREE_SPACE_CONTAINERS)]
    for pth, req_space in paths:
        pth_ = quote(str(Path(pth).resolve()))
        p = SimpleSubproc.run(
            f"df --output=avail -B 1 {pth_} | tail -n 1", None)
        yield p
        assert p.success, f"Could not check for free disk space in '{pth}'"
        v = int(p.stdout.rstrip())
        if v >= req_space:
            log_success(f"Free space in {pth} is ok ({v / GIB} GiB).")
        else:
            msg = f"The directory '{pth}' has insufficient " + \
                "free space available: expected at least " + \
                f"{req_space / GIB} GiB, but found {v / GIB} GiB"
            if not BYPASS_DISK_CHECK:
                raise AssertionError(
                    msg + ". You can ignore this error at your own risk " +
                    "using --bypass-disk.")
            else:
                log_warning(msg)

def check_free_memory(prev: Results, output: Path) -> SubResGen:
    "Check for system requirements: free memory"
    dep(prev, check_dependencies)
    p = SimpleSubproc.run("cat /proc/meminfo", None)
    yield p
    assert p.success, "Could not check for free memory"
    m = re.findall(r'MemAvailable:\s*([0-9]+) kB', p.stdout.rstrip())
    assert len(m) == 1, "Invalid memory info from /proc/meminfo"
    v = int(m[0]) * 1024
    error = f"The free memory {v / GIB} GiB is " + \
        "insufficient and may lead to errors. Expecting at least " + \
        f"{FREE_MEMORY / GIB} GiB to be available during the entire " + \
        "processing. Please also make sure that no programs (like " + \
        "'earlyoom') are in place that might prevent programs from " + \
        "accessing this amount of memory."
    if v < FREE_MEMORY:
        if BYPASS_MEMORY_CHECK:
            log_warning(error)
        else:
            raise AssertionError(
                error +
                " Skip this error at your own risk with '--bypass-mem'.")
    else:
        log_success(f"Free memory is ok ({v / GIB} GiB).")

def mkdirs(prev: Results, output: Path) -> SubResGen:
    "Make subdirectories in output directory"
    dep(prev, check_dependencies)
    for p in OUTPUT_SUBDIRS:
        yield SimpleSubproc.run(
            f"mkdir -p {quote(str(Path(output, p)))}", output)


def extract_files(prev: Results, output: Path) -> SubResGen:
    "Extract required scripts and files"
    dep(prev, check_dependencies, mkdirs)
    # If we are running as a standalone zipimport, we need to extract the
    # required auxiliary files from __file__
    global PROGRAM_DIR
    if PROGRAM_DIR.is_file():
        temp_pth = Path(output, EXTRACT_PROG_TMP_NAME)
        with open(PROGRAM_DIR, "rb") as in_f, \
                open(temp_pth, "wb") as out_f:
            # Check and skip header
            header_exp = "#!/bin/env python3\n"
            header_real = in_f.read(len(header_exp))
            assert header_real == header_exp.encode("utf-8"), \
                "Program bundle invalid"
            # Copy body
            out_f.write(in_f.read())
        # Extract all files into new temporary program dir
        new_prog_dir = Path(output, EXTRACT_PROG_DIR)
        new_prog_dir.mkdir(exist_ok=True)
        with ZipFile(temp_pth, "r") as zf:
            zf.extractall(new_prog_dir)
        temp_pth.unlink()
        log_info("Unpacked zipped reproduction.py code to", new_prog_dir)
        yield InternalStep(True)
        PROGRAM_DIR = new_prog_dir
        # Restore executability flags
        for ext in ("py", "sh"):
            yield SimpleSubproc.run(
                f"find . -name '*.{ext}' -exec chmod +x {{}} \\+",
                new_prog_dir)
    else:
        log_info("The program is not running from an archive. Skipping.")
        yield InternalStep(True)

PREP: Steps = [
    # expected time: < 1 sec
    # Note: The expected times here and in the other lists of steps refer to
    # an Ubuntu Server 24.04.1 LTS with AMD Ryzen 9 7950X (32 threads) and
    # 128 GB RAM
    check_dependencies,
    check_disk_free,
    check_free_memory,
    mkdirs,
    extract_files
]
###############################################################################
def build_spatial_container(prev: Results, output: Path) -> SubResGen:
    "Build main container image"
    dep(prev, *PREP)
    if BYPASS_CONTAINER_BUILDS:
        yield SimpleSubproc.run("podman image inspect localhost/spatial")
        return
    p = SimpleSubproc.run("podman build -t spatial .", PROGRAM_DIR,
                          Path(output, "log", "build_spatial.log"))
    yield p
    assert p.success, "Building main image failed"


def build_stat_repro_container(prev: Results, output: Path) -> SubResGen:
    "Build container image for statistical analysis"
    dep(prev, *PREP)
    if BYPASS_CONTAINER_BUILDS:
        yield SimpleSubproc.run(
            "podman image inspect localhost/spatial_stat_repro")
        return
    c = "podman build -t spatial_stat_repro " + \
        "--file=Dockerfile.stat_repro ."
    p = SimpleSubproc.run(c, Path(PROGRAM_DIR, "reproduction", "stat"),
                          Path(output, "log", "build_stat_repro.log"))
    yield p
    assert p.success, "Building statistical analysis image failed"


def pull_qlever(prev: Results, output: Path) -> SubResGen:
    "Pull qlever container images"
    dep(prev, *PREP)
    for hash_ in (QLEVER_NEW_HASH, QLEVER_NO_GEOPOINT_HASH):
        img = f"{QLEVER_IMAGE}:commit-{hash_[:7]}"
        if BYPASS_CONTAINER_BUILDS:
            yield SimpleSubproc.run("podman image inspect " + img)
            continue
        p = SimpleSubproc.run(
            "podman pull " + img,
            PROGRAM_DIR, Path(output, "log", "pull_qlever.log"))
        yield p
        assert p.success, \
            f"Could not pull qlever image for commit hash {hash_}"


def build_osm2rdf(prev: Results, output: Path) -> SubResGen:
    "Build osm2rdf container image"
    dep(prev, *PREP)
    if BYPASS_CONTAINER_BUILDS:
        yield SimpleSubproc.run("podman image inspect localhost/osm2rdf")
        return
    c = "podman build -t osm2rdf " + \
        "--file=Dockerfile.osm2rdf ."
    p = SimpleSubproc.run(c, Path(PROGRAM_DIR, "reproduction", "qlever"),
                          Path(output, "log", "build_osm2rdf_repro.log"))
    yield p
    assert p.success, "Building osm2rdf image failed"


def build_postgres_container(prev: Results, output: Path) -> SubResGen:
    "Build postgres image for evaluation"
    dep(prev, *PREP)
    if BYPASS_CONTAINER_BUILDS:
        yield SimpleSubproc.run("podman image inspect localhost/eval_postgres")
        return
    c = "podman build -t eval_postgres --file=Dockerfile.postgres ."
    p = SimpleSubproc.run(
        c, Path(PROGRAM_DIR, "reproduction", "postgres"),
        Path(output, "log", "build_postgres.log"))
    yield p
    assert p.success, "Building postgres evaluation image failed"


def build_spatialjoin_container(prev: Results, output: Path) -> SubResGen:
    "Build spatialjoin image for postgres evaluation"
    dep(prev, *PREP)
    if BYPASS_CONTAINER_BUILDS:
        yield SimpleSubproc.run("podman image inspect localhost/spatialjoin")
        return
    c = "podman build -t spatialjoin --file=Dockerfile.spatialjoin ."
    p = SimpleSubproc.run(
        c, Path(PROGRAM_DIR, "reproduction", "postgres"),
        Path(output, "log", "build_spatialjoin.log"))
    yield p
    assert p.success, "Building spatialjoin image failed"

BUILD_CONTAINERS: Steps = [
    # expected time: 8 min
    build_spatial_container,      # ca. 10 sec
    build_stat_repro_container,   # ca. 1 min 15 sec
    pull_qlever,                  # ca. 10 sec
    build_osm2rdf,                # ca. 3 min 10 sec
    build_postgres_container,     # ca. 20 sec
    build_spatialjoin_container,  # ca. 2 min 35 sec
]
###############################################################################
def main_container(command: str, dir: Path) -> SimpleSubproc:
    "Helper to run a Makefile target in the spatial container"
    proc = SimpleSubproc.run(
        f"podman run --rm -it -v ./:/output:rw spatial make {command}", dir)
    assert proc.success, \
        f"Could not run 'make {command}' in 'spatial' container"
    return proc

def qlever_clear_cache(port: int) -> SimpleSubproc:
    "Helper to request a cache reset on a QLever instance"
    proc = SimpleSubproc.run(
        f"curl 'http://localhost:{port}/' -X POST " +
        "-H 'Accept: application/qlever-results+json' " +
        "-H 'Content-Type: application/x-www-form-urlencoded' " +
        "--data 'cmd=clear-cache'")
    json.loads(proc.stdout)  # The response should be valid JSON
    assert proc.success, f"Could not clear cache of QLever instance at {port}."
    return proc

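# Clearing the cache before timing a query presumably ensures that measured
# execution times reflect a cold QLever cache rather than previously cached
# results.
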
def qlever_query(port: int, name: Optional[str], query: Path, output: Path,
                 qlever_json: bool = True) -> SimpleSubproc:
    "Helper to run a SPARQL query on a QLever instance"
    accept = "application/qlever-results+json" if qlever_json \
        else "text/tab-separated-values"
    ext = "json" if qlever_json else "tsv"
    out = ""
    if name:
        out_pth = str(Path(output, "results", f"qlever_{port}_{name}.{ext}"))
        out = "-o " + quote(out_pth)
    proc = SimpleSubproc.run(
        f"curl 'http://localhost:{port}/' " +
        "--fail -X POST " +
        f"-H 'Accept: {accept}' " +
        "-H 'Content-Type: application/sparql-query' " +
        f"--data-binary @{quote(str(query))} " + out)
    proc.identifier = name
    assert proc.success, f"Could not run SPARQL query {str(query)} on " + \
        f"QLever instance at {port}."
    return proc

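# Illustrative usage (names are made up): qlever_query(7925, "example",
# Path("example.sparql"), output) would POST the query to the osm-de
# instance (port 7925, see QLEVER_INSTANCES) and save the response to
# <output>/results/qlever_7925_example.json.
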
###############################################################################
def copy_qleverfiles(prev: Results, output: Path) -> SubResGen:
    "Copy Qleverfiles into appropriate directories"
    dep(prev, *PREP)
    for name in QLEVER_INSTANCES:
        source_ = str(Path(PROGRAM_DIR, "reproduction", "qlever",
                           f"Qleverfile-{name}.ini"))
        target_ = str(Path(output, name, "Qleverfile"))
        yield SimpleSubproc.run(f"cp {quote(source_)} {quote(target_)}",
                                PROGRAM_DIR)


def get_data_election(prev: Results, output: Path) -> SubResGen:
    "Generate election data sets"
    dep(prev, *PREP, build_spatial_container, copy_qleverfiles, pull_qlever)
    if "election" not in BYPASS_GET_DATA:
        yield SimpleSubproc.run("qlever get-data", Path(output, "election"),
                                Path(output, "log", "get_data_election.log"))
    # Check resulting data
    for f, _min, _max in ELECTION_EXPECTED_SIZE:
        assert _min <= Path(output, "election", f).stat().st_size <= _max, \
            f"Checking {f} failed"
    yield InternalStep(True)

def get_data_osm_de(prev: Results, output: Path) -> SubResGen:
    "Generate OpenStreetMap Germany data set with auxiliary geometries"
    dep(prev, *PREP, build_spatial_container,
        pull_qlever, build_osm2rdf, get_data_election, copy_qleverfiles)
    # Can also be generated using
    # "bzcat *.ttl.bz2 | grep '@prefix' > prefixes.ttl"
    el_prefixes = str(Path(PROGRAM_DIR, "election", "prefixes.ttl"))
    el_target = str(Path(output, "osm-de"))
    cp1 = SimpleSubproc.run(f"cp {quote(el_prefixes)} {quote(el_target)}")
    yield cp1
    assert cp1.success, "Copying election2rdf prefixes to OSM dir failed"
    aux_geo_src = str(Path(output, "election", "btw21-aux-geo.tsv"))
    aux_geo_trg = str(Path(output, "osm-de"))
    cp2 = SimpleSubproc.run(f"cp {quote(aux_geo_src)} {quote(aux_geo_trg)}")
    yield cp2
    assert cp2.success, "Copying btw21-aux-geo.tsv to OSM dir failed"
    if "osm-de" not in BYPASS_GET_DATA:
        yield SimpleSubproc.run("qlever get-data", Path(output, "osm-de"),
                                Path(output, "log", "get_data_osm-de.log"))
    # Check resulting data
    f, _min, _max = OSM_DE_EXPECTED_SIZE
    assert _min <= Path(output, "osm-de", f).stat().st_size <= _max, \
        "Checking osm-germany.ttl.bz2 failed"
    yield InternalStep(True)

def get_data_gtfs(prev: Results, output: Path) -> SubResGen:
    "Generate GTFS data sets"
    dep(prev, *PREP, build_spatial_container, pull_qlever, copy_qleverfiles)
    if "gtfs" not in BYPASS_GET_DATA:
        yield SimpleSubproc.run("qlever get-data", Path(output, "gtfs"),
                                Path(output, "log", "get_data_gtfs.log"))
    # Check resulting data
    for f, _min, _max in GTFS_EXPECTED_SIZE:
        assert _min <= Path(output, "gtfs", f).stat().st_size <= _max, \
            f"Checking {f} failed"
    yield InternalStep(True)


def copy_data_gtfs_no_geopoint(prev: Results, output: Path) -> SubResGen:
    "Copy gtfs2rdf output from gtfs to gtfs-no-geopoint"
    dep(prev, *PREP, get_data_gtfs)
    trg = Path(output, "gtfs-no-geopoint")
    for f, _min, _max in GTFS_EXPECTED_SIZE:
        pth = Path(output, "gtfs", f)
        assert pth.exists()
        assert _min <= pth.stat().st_size <= _max, \
            f"gtfs2rdf output file {pth} does not have the expected size"
        if "gtfs-no-geopoint" in BYPASS_GET_DATA:
            assert Path(trg, f).exists(), f + " missing"
            yield InternalStep(True)
        else:
            cp = SimpleSubproc.run(f"cp {quote(str(pth))} {quote(str(trg))}")
            yield cp
            assert cp.success, f"Copying gtfs2rdf output file {f} failed"

def get_data_postgres(prev: Results, output: Path) -> SubResGen:
    "Download OpenStreetMap Germany for PostgreSQL if not already downloaded"
    dep(prev, *PREP)
    _f, _min, _max = OSM_GERMANY_PBF_SIZE
    expected = Path(output, "osm-de", _f)
    if expected.exists():
        assert _min <= expected.stat().st_size <= _max, \
            "OSM Germany PBF file does not have the expected size"
        yield InternalStep(True)
        return
    log_info("Downloading", OSM_GERMANY_PBF)
    yield SimpleSubproc.run(
        "curl --fail -L -o " + quote(str(Path("osm-de", _f))) + " " +
        quote(OSM_GERMANY_PBF), output,
        Path(output, "log", "get_data_postgres.log"))

GET_DATA: Steps = [
    # expected time: 3 h 32 min
    copy_qleverfiles,            # ca. 1 sec
    get_data_election,           # ca. 16 sec
    get_data_osm_de,             # ca. download 2 min 5 sec + osm2rdf 2 h 12 min
    get_data_gtfs,               # ca. 1 h 23 min
    copy_data_gtfs_no_geopoint,  # ca. 2 sec
    # ca. Delfi21 35 min + Delfi24 38 min + VAG 35 sec +
    # Digitraffic.fi 1 min 56 sec = 1 h 16 min
    get_data_postgres            # ca. 0 sec (already exists from osm-de)
]
###############################################################################
def qlever_index(instance: str, output: Path) -> SubResGen:
    "Helper to run 'qlever index'"
    idx_dir = Path(output, instance)
    if instance in BYPASS_INDEX_BUILD:
        # Check if index is present
        n = 0
        for glob in QLEVER_INDEX_FILES_GLOB:
            i = len(list(idx_dir.glob(glob)))
            n += i
            assert i > 0, \
                "Index build was bypassed, but could not find files " + \
                f"matching '{glob}' in directory '{idx_dir}'"
        log_info(
            f"Bypassed index build for {instance}. {n} index files found.")
        yield InternalStep(True)
        return
    # Build index
    index_log = Path(output, "log", f"index_{instance}.log")
    p = SimpleSubproc.run("qlever index --overwrite-existing",
                          Path(output, instance),
                          index_log)
    yield p
    log: list[str] = []
    with open(index_log, "r") as f:
        log = [line.strip() for line in f if line.strip()]
    if not log:
        log_error("Missing Index Builder log.")
        yield InternalStep(False, "", "")
    elif "ERROR" in log[-1]:
        log_error("The QLever Index Builder crashed with message:", log[-1])
        yield InternalStep(False, "", log[-1])

def index_gtfs_no_geopoint(prev: Results, output: Path) -> SubResGen:
    "Index GTFS data set using old QLever without GeoPoints"
    dep(prev, *PREP, pull_qlever, copy_data_gtfs_no_geopoint)
    yield from qlever_index("gtfs-no-geopoint", output)


def index_osm_de(prev: Results, output: Path) -> SubResGen:
    "Index OpenStreetMap Germany using up-to-date QLever"
    dep(prev, *PREP, pull_qlever, get_data_osm_de)
    yield from qlever_index("osm-de", output)


def index_election(prev: Results, output: Path) -> SubResGen:
    "Index election data using up-to-date QLever"
    dep(prev, *PREP, pull_qlever, get_data_election)
    yield from qlever_index("election", output)


def index_gtfs(prev: Results, output: Path) -> SubResGen:
    "Index GTFS data sets using up-to-date QLever"
    dep(prev, *PREP, pull_qlever, get_data_gtfs)
    yield from qlever_index("gtfs", output)

def check_container_status(name: str, status: str) -> SubResGen:
    "Helper: Check if a podman container with a given name has a given status"
    p = SimpleSubproc.run("podman container inspect " + name)
    yield p
    statuses = [container["State"]["Status"]
                for container in json.loads(p.stdout)]
    yield InternalStep(len(statuses) > 0)
    yield InternalStep(all(actual == status for actual in statuses))


def check_container_running(name: str) -> SubResGen:
    "Helper: Check if a podman container with the given name is running"
    yield from check_container_status(name, "running")

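# Usage: these helpers are consumed with "yield from", e.g.
#   yield from check_container_running("eval_postgres")
# in create_postgres below, so the inspect call and the two checks are
# recorded as separate sub-results of the calling step.
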
def create_postgres(prev: Results, output: Path) -> SubResGen:
    "Start PostgreSQL container and initialize database"
    dep(prev, *PREP, build_postgres_container)
    if "postgres" in BYPASS_INDEX_BUILD:
        log_info("Podman Index Build Bypass: Trying to start the container " +
                 "'eval_postgres' instead of replacing it.")
        # Start container
        yield SimpleSubproc.run("podman start eval_postgres",
                                output,
                                Path(output, "log", "start_postgres.log"))
        # Wait for start
        log_info("Wait 10 sec for the detached container to start")
        sleep(10)
        yield InternalStep(True)
        # Check if it is running
        yield from check_container_running("eval_postgres")
        return
    results = str(Path(output, "results"))
    osm_de = str(Path(output, "osm-de"))
    cmd = f"podman run -dt -v {quote(results)}:/output:rw " + \
        f"-v {quote(osm_de)}:/mnt:ro --name eval_postgres " + \
        "--replace eval_postgres"
    yield SimpleSubproc.run(cmd, output,
                            Path(output, "log", "create_postgres.log"))
    log_info("Wait 10 sec for the detached container to start")
    sleep(10)
    yield InternalStep(True)


def stop_postgres(prev: Results, output: Path) -> SubResGen:
    "Stop PostgreSQL container"
    dep(prev, *PREP, build_postgres_container, create_postgres)
    yield SimpleSubproc.run("podman stop eval_postgres",
                            output, Path(output, "log", "stop_postgres.log"))
    yield from check_container_status("eval_postgres", "exited")


def start_postgres(prev: Results, output: Path) -> SubResGen:
    "Start existing PostgreSQL container"
    dep(prev, *PREP, build_postgres_container, create_postgres)
    yield SimpleSubproc.run("podman start eval_postgres",
                            output, Path(output, "log", "start_postgres.log"))
    log_info("Wait 10 sec for the detached container to start")
    sleep(10)
    yield InternalStep(True)
    yield from check_container_running("eval_postgres")

def run_psql(name: Optional[str], query: str, output: Path,
             query_dir: Optional[Path] = None) -> SimpleSubproc:
    "Helper: Run a query in the PostgreSQL container"
    if not query_dir:
        query_dir = Path(PROGRAM_DIR, "reproduction", "postgres")
    query_pth = str(Path(query_dir, query))
    out = ""
    q = ""
    log: Path | bool = False
    if name:
        out_pth = str(Path(output, "results", f"postgres_{name}.csv"))
        out = " > " + quote(out_pth)
        log = Path(output, "log", "run_postgres_" + name + ".log")
        q = "--quiet "
    psql = SimpleSubproc.run(
        f"podman exec -i -u postgres eval_postgres psql --csv -d osm {q}< " +
        quote(query_pth) + out, output, log)
    psql.identifier = name
    assert psql.success, \
        f"PostgreSQL: running query '{query}' failed: '{psql.stderr}'"
    return psql

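# Illustrative usage (file name made up): run_psql("stats", "stats.sql",
# output) would pipe reproduction/postgres/stats.sql into psql inside the
# eval_postgres container and redirect the CSV output to
# <output>/results/postgres_stats.csv.
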
def osm2pgsql(prev: Results, output: Path) -> SubResGen:
    "Import OpenStreetMap Germany into PostgreSQL using osm2pgsql"
    dep(prev, *PREP, build_postgres_container, create_postgres,
        get_data_postgres)
    if "postgres" in BYPASS_INDEX_BUILD:
        log_info("Bypass Index Build: Checking 'planet_osm_*' tables")
        p = run_psql(None, "check_osm2pgsql.sql", output)
        yield p
        r = re.findall(r'[0-9]+', p.stdout)
        assert len(r) == 3, "Not three tables"
        for r_count in r:
            assert int(r_count) > 1_000_000, "Table contents missing"
        return
    yield run_psql(None, "drop.sql", output)
    yield SimpleSubproc.run(
        "podman exec -u postgres eval_postgres osm2pgsql " +
        "--database osm /mnt/osm-germany.pbf",
        output, Path(output, "log", "osm2pgsql.log"))

def postgres_tables_indexes(prev: Results, output: Path) -> SubResGen:
    "PostgreSQL: create tables, indexes, precompute centroids"
    dep(prev, *PREP, build_postgres_container, create_postgres, osm2pgsql)
    if "postgres" in BYPASS_INDEX_BUILD:
        log_info("Bypass Index Build: Checking 'osm_centroids' table")
        p = run_psql(None, "check_osm_centroids.sql", output)
        yield p
        r = re.findall(r'[0-9]+', p.stdout)
        assert len(r) == 1 and int(r[0]) > 1_000_000, "Missing osm_centroids"
        return
    yield run_psql(None, "create_osm_centroids.sql", output)
    yield run_psql(None, "idx_centroids.sql", output)
    yield run_psql(None, "idx_text.sql", output)
    yield run_psql(None, "create_spatialrelations.sql", output)

def get_postgres_data_size() -> tuple[SimpleSubproc, int]:
    "Helper: Collect the current size of the PostgreSQL data directory"
    cmd = "podman exec -u postgres eval_postgres bash -c \"du -bcs " + \
        "/var/lib/postgresql/data | tail -n 1 | grep -oP '[0-9]+'\""
    p = SimpleSubproc.run(cmd)
    n = int(p.stdout.strip())
    log_info(f"PostgreSQL Data Size: {n} bytes = ca. {round(n / GIB)} GiB")
    return p, n

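# Module-level cache: holds the PostgreSQL data directory size measured
# before the spatial relations are imported; presumably compared against a
# later measurement to derive the size added by the spatial-relations data.
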
index_size_pg_no_sr: Optional[int] = None
def collect_index_size_pg_no_sr(prev: Results, output: Path) -> SubResGen:
"Collect size of PostgreSQL indexes without spatial relations"