2 changes: 1 addition & 1 deletion ci/plugins/mzcompose/hooks/command
@@ -345,7 +345,7 @@ cleanup() {
&& [ "$BUILDKITE_LABEL" != "Parallel Benchmark against QA Benchmarking Staging Environment" ] \
&& [[ ! "$BUILDKITE_LABEL" =~ Terraform\ .* ]] \
&& [[ ! "$BUILDKITE_LABEL" =~ Orchestratord\ test\ .* ]] \
&& [ "$BUILDKITE_LABEL" != "Cluster spec sheet" ]; then
&& [[ ! "$BUILDKITE_LABEL" =~ Cluster\ spec\ sheet.* ]]; then
echo "+++ services.log is empty, failing"
exit 1
fi
43 changes: 30 additions & 13 deletions ci/release-qualification/pipeline.template.yml
@@ -530,16 +530,33 @@ steps:
agents:
queue: hetzner-x86-64-dedi-48cpu-192gb # 1 TB disk

- id: cluster-spec-sheet
label: Cluster spec sheet
depends_on: build-aarch64
timeout_in_minutes: 3600
concurrency: 1
concurrency_group: 'cluster-spec-sheet'
plugins:
- ./ci/plugins/mzcompose:
composition: cluster-spec-sheet
run: default
args: [--cleanup]
agents:
queue: linux-aarch64-small
- group: Cluster spec sheet
key: cluster-spec-sheet
steps:
- id: cluster-spec-sheet-cluster
label: "Cluster spec sheet: Cluster (against Production)"
depends_on: build-aarch64
timeout_in_minutes: 3600
concurrency: 1
concurrency_group: 'cluster-spec-sheet'
plugins:
- ./ci/plugins/mzcompose:
composition: cluster-spec-sheet
run: default
args: [--cleanup, --target=cloud-production, cluster]
agents:
queue: linux-aarch64-small

- id: cluster-spec-sheet-environmentd
label: "Cluster spec sheet: Environmentd (against Staging)"
depends_on: build-aarch64
timeout_in_minutes: 3600
concurrency: 1
concurrency_group: 'cluster-spec-sheet-cluster'
plugins:
- ./ci/plugins/mzcompose:
composition: cluster-spec-sheet
run: default
args: [--cleanup, --target=cloud-staging, environmentd]
agents:
queue: linux-aarch64-small
57 changes: 49 additions & 8 deletions misc/python/materialize/mzcompose/composition.py
@@ -1667,16 +1667,57 @@ def promote_mz(self, mz_service: str = "materialized") -> None:
)
assert result["result"] == "Success", f"Unexpected result {result}"

def cloud_hostname(self, quiet: bool = False) -> str:
"""Uses the mz command line tool to get the hostname of the cloud instance"""
def cloud_hostname(
self, quiet: bool = False, timeout_secs: int = 180, poll_interval: float = 2.0
) -> str:
"""Uses the mz command line tool to get the hostname of the cloud instance, waiting until the region is ready."""
Review comment from @ggevay (Contributor, Author), Nov 9, 2025:

I had to add a retry loop here, because sometimes it would fail despite `mz region enable` already having come back saying that the cloud side of things is done. I think this started to happen when I added an `mz region disable` before my `mz region enable` calls. (The reason for adding the disabling is explained in a code comment in `cloud_disable_enable_and_wait`.)

if not quiet:
print("Obtaining hostname of cloud instance ...")
region_status = self.run("mz", "region", "show", capture=True, rm=True)
sql_line = region_status.stdout.split("\n")[2]
cloud_url = sql_line.split("\t")[1].strip()
# It is necessary to append the 'https://' protocol; otherwise, urllib can't parse it correctly.
cloud_hostname = urllib.parse.urlparse("https://" + cloud_url).hostname
return str(cloud_hostname)

deadline = time.time() + timeout_secs
last_msg = ""

while time.time() < deadline:
proc = self.run(
"mz",
"region",
"show",
capture=True,
capture_stderr=True,
rm=True,
check=False,
silent=True,
)
out = proc.stdout or ""
err = proc.stderr or ""

if proc.returncode == 0:
lines = out.splitlines()
if len(lines) >= 3:
line = lines[2]
parts = line.split("\t")
if len(parts) >= 2:
cloud_url = parts[1].strip()
# It is necessary to append the 'https://' protocol; otherwise, urllib can't parse it correctly.
hostname = urllib.parse.urlparse(
"https://" + cloud_url
).hostname
if hostname:
return str(hostname)
else:
last_msg = f"failed to parse hostname from URL: {cloud_url}"
else:
last_msg = f"unexpected region show output (no tab in line 3): {line!r}"
else:
last_msg = f"unexpected region show output (too few lines): {out!r}"
else:
last_msg = (out + "\n" + err).strip()

time.sleep(poll_interval)

raise UIError(
f"failed to obtain cloud hostname within {timeout_secs}s: {last_msg}"
)

T = TypeVar("T")

misc/python/materialize/test_analytics/data/cluster_spec_sheet/cluster_spec_sheet_result_storage.py
@@ -28,6 +28,19 @@ class ClusterSpecSheetResultEntry:
time_ms: int | None


@dataclass
class ClusterSpecSheetEnvironmentdResultEntry:
scenario: str
scenario_version: str
scale: int
mode: str
category: str
test_name: str
envd_cpus: int
repetition: int
qps: float | None


class ClusterSpecSheetResultStorage(BaseDataStorage):

def add_result(
@@ -76,3 +89,51 @@ def add_result(
)

self.database_connector.add_update_statements(sql_statements)


class ClusterSpecSheetEnvironmentdResultStorage(BaseDataStorage):

def add_result(
self,
framework_version: str,
results: list[ClusterSpecSheetEnvironmentdResultEntry],
) -> None:
job_id = buildkite.get_var(BuildkiteEnvVar.BUILDKITE_JOB_ID)

sql_statements = []

for result_entry in results:
# TODO: remove NULL castings when database-issues#8100 is resolved
sql_statements.append(
f"""
INSERT INTO cluster_spec_sheet_environmentd_result
(
build_job_id,
framework_version,
scenario,
scenario_version,
scale,
mode,
category,
test_name,
envd_cpus,
repetition,
qps
)
SELECT
{as_sanitized_literal(job_id)},
{as_sanitized_literal(framework_version)},
{as_sanitized_literal(result_entry.scenario)},
{as_sanitized_literal(result_entry.scenario_version)},
{result_entry.scale},
{as_sanitized_literal(result_entry.mode)},
{as_sanitized_literal(result_entry.category)},
{as_sanitized_literal(result_entry.test_name)},
{result_entry.envd_cpus},
{result_entry.repetition},
{result_entry.qps or 'NULL::FLOAT'}
;
"""
)

self.database_connector.add_update_statements(sql_statements)
@@ -14,6 +14,7 @@ DELETE FROM scalability_framework_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
DELETE FROM parallel_benchmark_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
DELETE FROM product_limits_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
DELETE FROM cluster_spec_sheet_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
DELETE FROM cluster_spec_sheet_environmentd_result WHERE build_job_id IN (SELECT build_id FROM build_job WHERE build_id IN (%build-ids%));
DELETE FROM build_annotation_error WHERE build_job_id IN (SELECT build_job_id FROM build_annotation WHERE build_id IN (%build-ids%));
DELETE FROM build_annotation WHERE build_id IN (%build-ids%);
DELETE FROM build_job WHERE build_id IN (%build-ids%);
@@ -0,0 +1,27 @@
-- Copyright Materialize, Inc. and contributors. All rights reserved.
--
-- Use of this software is governed by the Business Source License
-- included in the LICENSE file at the root of this repository.
--
-- As of the Change Date specified in that file, in accordance with
-- the Business Source License, use of this software will be governed
-- by the Apache License, Version 2.0.


-- result of individual cluster spec sheet environmentd scenarios
CREATE TABLE cluster_spec_sheet_environmentd_result (
build_job_id TEXT NOT NULL,
framework_version TEXT NOT NULL,
scenario TEXT NOT NULL,
scenario_version TEXT NOT NULL,
scale INT NOT NULL,
mode TEXT NOT NULL,
category TEXT NOT NULL,
test_name TEXT NOT NULL,
envd_cpus INT NOT NULL,
repetition INT NOT NULL,
qps FLOAT
);

ALTER TABLE cluster_spec_sheet_environmentd_result OWNER TO qa;
GRANT SELECT, INSERT, UPDATE ON TABLE cluster_spec_sheet_environmentd_result TO "hetzner-ci";
@@ -33,6 +33,10 @@ CREATE OR REPLACE VIEW v_data_integrity (table_name, own_item_key, referenced_it
FROM cluster_spec_sheet_result
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
UNION
SELECT 'cluster_spec_sheet_environmentd_result', build_job_id, build_job_id, 'cluster spec sheet environmentd result references missing build job'
FROM cluster_spec_sheet_environmentd_result
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
UNION
SELECT 'build_annotation', build_job_id, build_job_id, 'build annotation references missing build job'
FROM build_annotation
WHERE build_job_id NOT IN (SELECT build_job_id FROM build_job)
4 changes: 4 additions & 0 deletions misc/python/materialize/test_analytics/test_analytics_db.py
@@ -26,6 +26,7 @@
BuildAnnotationStorage,
)
from materialize.test_analytics.data.cluster_spec_sheet.cluster_spec_sheet_result_storage import (
ClusterSpecSheetEnvironmentdResultStorage,
ClusterSpecSheetResultStorage,
)
from materialize.test_analytics.data.feature_benchmark.feature_benchmark_result_storage import (
@@ -79,6 +80,9 @@ def __init__(self, config: MzDbConfig):
self.cluster_spec_sheet_results = ClusterSpecSheetResultStorage(
self.database_connector
)
self.cluster_spec_sheet_environmentd_results = (
ClusterSpecSheetEnvironmentdResultStorage(self.database_connector)
)

def _create_database_connector(self, config: MzDbConfig) -> DatabaseConnector:
if config.enabled:
8 changes: 6 additions & 2 deletions test/cluster-spec-sheet/README.md
@@ -8,8 +8,6 @@ Reproduce data for the cluster spec sheet effort.

This will run all scenarios currently defined for the cluster spec sheet.

The test expects a default cluster.

Pass `--cleanup` to disable the region after the test.
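
For example, mirroring the CI invocation in this pull request (a sketch; the `--target` value and the trailing `cluster` scenario selector are taken from the pipeline configuration and may need to be adapted for local runs):

```
bin/mzcompose --find cluster-spec-sheet run default --cleanup --target=cloud-production cluster
```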

# Running
@@ -48,3 +46,9 @@ In this case, the environment variables are not required.
```
bin/mzcompose --find cluster-spec-sheet run default --target=docker
```

## Scenarios

There are two kinds of scenarios:
- cluster scaling: These measure run times and arrangement sizes.
- envd scaling: These measure QPS.
Review comment on lines +50 to +54, from a Member:

Please document how to invoke each.
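
A minimal sketch of how each kind of scenario might be invoked, inferred from the CI pipeline arguments in this pull request (the positional selectors `cluster` and `environmentd` and the `--target` values are taken from the pipeline and may differ for local runs):

```
# Cluster scaling scenarios (run times and arrangement sizes), against a production cloud region:
bin/mzcompose --find cluster-spec-sheet run default --target=cloud-production cluster

# Envd scaling scenarios (QPS), against a staging cloud region:
bin/mzcompose --find cluster-spec-sheet run default --target=cloud-staging environmentd
```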

Loading