Skip to content

Commit 497c316

Browse files
craig[bot]nameisbhaskar
andcommitted
Merge #131339
131339: drtprod: yaml config for drt clusters r=sambhav-jain-16 ,vidit-bhat a=nameisbhaskar This PR has the YAML configurations for the drt-large and drt-chaos clusters. These configurations create the clusters and do the default setup, including the Datadog configuration. They also create the workload clusters. The Datadog setup scripts that are referenced in the configurations are also part of this PR. Fixes: #125381 Epic: None Co-authored-by: Bhaskarjyoti Bora <[email protected]>
2 parents 67dc7a1 + 2c0eb17 commit 497c316

File tree

6 files changed

+417
-0
lines changed

6 files changed

+417
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
---
# drtprod configuration that creates and configures the drt-chaos cluster and
# the workload-chaos cluster, and runs the Datadog setup script for each.
environment:
  # NOTE(review): the value below appears to have been redacted in this copy;
  # restore the real service-account address before using this file.
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
  ROACHPROD_DNS: 'drt.crdb.io'
  ROACHPROD_GCE_DNS_DOMAIN: 'drt.crdb.io'
  ROACHPROD_GCE_DNS_ZONE: 'drt'
  ROACHPROD_GCE_DEFAULT_PROJECT: 'cockroach-drt'
  CLUSTER: 'drt-chaos'
  WORKLOAD_CLUSTER: 'workload-chaos'

targets:
  # Target 1: the CockroachDB cluster.
  - target_name: $CLUSTER
    steps:
      # Create a 6-node managed GCE cluster with 4 local SSDs per node.
      # Rollback action for this step: destroy the cluster.
      - command: create
        args:
          - $CLUSTER
        flags:
          clouds: 'gce'
          gce-managed: true
          gce-enable-multiple-stores: true
          gce-zones: 'us-east1-d,us-east1-b,us-east1-c'
          nodes: 6
          gce-machine-type: 'n2-standard-16'
          local-ssd: true
          gce-local-ssd-count: 4
          username: 'drt'
          lifetime: '8760h'
          gce-image: 'ubuntu-2204-jammy-v20240319'
        on_rollback:
          - command: destroy
            args:
              - $CLUSTER

      # Sync for the gce cloud.
      - command: sync
        flags:
          clouds: 'gce'

      # Stage the cockroach binary on the cluster.
      - command: stage
        args:
          - $CLUSTER
          - 'cockroach'

      # Configure Datadog on the cluster.
      - script: 'pkg/cmd/drtprod/configs/setup_datadog_cluster'

      # Start cockroach from the staged binary.
      # Rollback action for this step: stop the cluster.
      # NOTE(review): the drt-large config also passes store-count: 4 here;
      # this cluster is likewise created with multiple stores and 4 local
      # SSDs - confirm whether store-count is needed.
      - command: start
        args:
          - $CLUSTER
          - '--binary'
          - './cockroach'
        flags:
          enable-fluent-sink: true
          restart: false
          sql-port: 26257
        on_rollback:
          - command: stop
            args:
              - $CLUSTER

      # Unmask and enable cron, then install an @reboot crontab entry that
      # runs ~/cockroach.sh after a 100-second sleep.
      - command: run
        args:
          - $CLUSTER
          - '--'
          - >-
            sudo systemctl unmask cron.service ;
            sudo systemctl enable cron.service ;
            echo "crontab -l ; echo '@reboot sleep 100 && ~/cockroach.sh' | crontab -" > t.sh ;
            sh t.sh ;
            rm t.sh

  # Target 2: the workload cluster.
  - target_name: $WORKLOAD_CLUSTER
    steps:
      # Create a single-node GCE cluster.
      # Rollback action for this step: destroy the cluster.
      - command: create
        args:
          - $WORKLOAD_CLUSTER
        flags:
          clouds: 'gce'
          gce-zones: 'us-east1-c'
          nodes: 1
          gce-machine-type: 'n2-standard-8'
          os-volume-size: 100
          username: 'workload'
          lifetime: '8760h'
        on_rollback:
          - command: destroy
            args:
              - $WORKLOAD_CLUSTER

      # Sync for the gce cloud.
      - command: sync
        flags:
          clouds: 'gce'

      # Stage the cockroach and workload binaries.
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - 'cockroach'
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - 'workload'

      # Configure Datadog on the workload cluster.
      - script: 'pkg/cmd/drtprod/configs/setup_datadog_workload'
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
---
# drtprod configuration that destroys the drt-chaos and workload-chaos
# clusters.
environment:
  # NOTE(review): the value below appears to have been redacted in this copy;
  # restore the real service-account address before using this file.
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
  ROACHPROD_DNS: 'drt.crdb.io'
  ROACHPROD_GCE_DNS_DOMAIN: 'drt.crdb.io'
  ROACHPROD_GCE_DNS_ZONE: 'drt'
  ROACHPROD_GCE_DEFAULT_PROJECT: 'cockroach-drt'
  CLUSTER: 'drt-chaos'
  WORKLOAD_CLUSTER: 'workload-chaos'

targets:
  # Destroy the CockroachDB cluster.
  - target_name: $CLUSTER
    steps:
      - command: destroy
        args:
          - $CLUSTER

  # Destroy the workload cluster.
  - target_name: $WORKLOAD_CLUSTER
    steps:
      - command: destroy
        args:
          - $WORKLOAD_CLUSTER
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
---
# drtprod configuration that creates and configures the drt-large cluster and
# the workload-large cluster, and runs the Datadog setup script for each.
environment:
  # NOTE(review): the value below appears to have been redacted in this copy;
  # restore the real service-account address before using this file.
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
  ROACHPROD_DNS: 'drt.crdb.io'
  ROACHPROD_GCE_DNS_DOMAIN: 'drt.crdb.io'
  ROACHPROD_GCE_DNS_ZONE: 'drt'
  ROACHPROD_GCE_DEFAULT_PROJECT: 'cockroach-drt'
  CLUSTER: 'drt-large'
  WORKLOAD_CLUSTER: 'workload-large'

targets:
  # Target 1: the CockroachDB cluster.
  - target_name: $CLUSTER
    steps:
      # Create a 15-node managed GCE cluster with 4 local SSDs per node.
      # The ":N" suffixes in gce-zones add up to the 15 requested nodes.
      # Rollback action for this step: destroy the cluster.
      - command: create
        args:
          - $CLUSTER
        flags:
          clouds: 'gce'
          gce-managed: true
          gce-enable-multiple-stores: true
          gce-zones: 'northamerica-northeast2-a:2,northamerica-northeast2-b:2,northamerica-northeast2-c:1,us-east5-a:2,us-east5-b:2,us-east5-c:1,us-east1-b:2,us-east1-c:2,us-east1-d:1'
          nodes: 15
          gce-machine-type: 'n2-standard-16'
          local-ssd: true
          gce-local-ssd-count: 4
          os-volume-size: 100
          username: 'drt'
          lifetime: '8760h'
        on_rollback:
          - command: destroy
            args:
              - $CLUSTER

      # Sync for the gce cloud.
      - command: sync
        flags:
          clouds: 'gce'

      # Stage the cockroach binary on the cluster.
      - command: stage
        args:
          - $CLUSTER
          - 'cockroach'

      # Configure Datadog on the cluster.
      - script: 'pkg/cmd/drtprod/configs/setup_datadog_cluster'

      # Start cockroach from the staged binary with 4 stores.
      # Rollback action for this step: stop the cluster.
      - command: start
        args:
          - $CLUSTER
          - '--binary'
          - './cockroach'
        flags:
          enable-fluent-sink: true
          store-count: 4
          restart: false
          sql-port: 26257
        on_rollback:
          - command: stop
            args:
              - $CLUSTER

      # Unmask and enable cron, then install an @reboot crontab entry that
      # runs ~/cockroach.sh after a 100-second sleep.
      - command: run
        args:
          - $CLUSTER
          - '--'
          - >-
            sudo systemctl unmask cron.service ;
            sudo systemctl enable cron.service ;
            echo "crontab -l ; echo '@reboot sleep 100 && ~/cockroach.sh' | crontab -" > t.sh ;
            sh t.sh ;
            rm t.sh

      # Set num_replicas and num_voters to 5 for the timeseries range.
      - command: sql
        args:
          - $CLUSTER:1
          - '--'
          - '-e'
          - 'ALTER RANGE timeseries CONFIGURE ZONE USING num_replicas=5,num_voters=5'

      # Set num_replicas and num_voters to 5 for the default range.
      - command: sql
        args:
          - $CLUSTER:1
          - '--'
          - '-e'
          - 'ALTER RANGE default CONFIGURE ZONE USING num_replicas=5,num_voters=5'

  # Target 2: the workload cluster.
  - target_name: $WORKLOAD_CLUSTER
    steps:
      # Create a 3-node GCE cluster across the three listed zones.
      # Rollback action for this step: destroy the cluster.
      - command: create
        args:
          - $WORKLOAD_CLUSTER
        flags:
          clouds: 'gce'
          gce-zones: 'northamerica-northeast2-a,us-east5-a,us-east1-b'
          nodes: 3
          gce-machine-type: 'n2d-standard-4'
          os-volume-size: 100
          username: 'workload'
          lifetime: '8760h'
        on_rollback:
          - command: destroy
            args:
              - $WORKLOAD_CLUSTER

      # Sync for the gce cloud.
      - command: sync
        flags:
          clouds: 'gce'

      # Stage the cockroach and workload binaries.
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - 'cockroach'
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - 'workload'

      # Configure Datadog on the workload cluster.
      - script: 'pkg/cmd/drtprod/configs/setup_datadog_workload'
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
---
# drtprod configuration that destroys the drt-large and workload-large
# clusters.
environment:
  # NOTE(review): the value below appears to have been redacted in this copy;
  # restore the real service-account address before using this file.
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
  ROACHPROD_DNS: 'drt.crdb.io'
  ROACHPROD_GCE_DNS_DOMAIN: 'drt.crdb.io'
  ROACHPROD_GCE_DNS_ZONE: 'drt'
  ROACHPROD_GCE_DEFAULT_PROJECT: 'cockroach-drt'
  CLUSTER: 'drt-large'
  WORKLOAD_CLUSTER: 'workload-large'

targets:
  # Destroy the CockroachDB cluster.
  - target_name: $CLUSTER
    steps:
      - command: destroy
        args:
          - $CLUSTER

  # Destroy the workload cluster.
  - target_name: $WORKLOAD_CLUSTER
    steps:
      - command: destroy
        args:
          - $WORKLOAD_CLUSTER
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash

# Sets up Datadog for the drt clusters.
#
# Required environment:
#   CLUSTER  name of the roachprod cluster to configure; the script exits
#            with status 1 if it is unset or empty.
#
# What the script does:
#   1. Reads the Datadog API key from the "datadog-api-key" secret in the
#      cockroach-drt GCP project (exits with status 1 if the result is empty).
#   2. Writes /etc/fluent-bit/config-override.yaml on the cluster so that
#      /var/log/audit/audit.log is tailed and shipped to Datadog.
#   3. Writes /etc/profile.d/99-datadog.sh on the cluster, exporting DD_SITE,
#      DD_API_KEY and DD_TAGS.
#   4. Runs `roachprod opentelemetry-start` and `roachprod fluent-bit-start`.

if [ -z "${CLUSTER}" ]; then
  echo "environment CLUSTER is not set"
  exit 1
fi

# TODO - this command does not work. It needs to be replaced with a way of
# obtaining the actual dd_api_key for the script to work.
dd_api_key="$(gcloud --project=cockroach-drt secrets versions access latest --secret datadog-api-key)"

if [ -z "${dd_api_key}" ]; then
  echo "Missing Datadog API key!"
  exit 1
fi

dd_site="us5.datadoghq.com"

# The heredoc lives inside a double-quoted string, so every ${...} below is
# expanded locally before the command is sent to the cluster.
# ${CLUSTER%:*} is the cluster name with any trailing ":<suffix>" removed.
roachprod ssh "${CLUSTER}" -- "sudo mkdir -p /etc/fluent-bit && sudo tee /etc/fluent-bit/config-override.yaml > /dev/null << EOF
---
pipeline:
  inputs:
    - name: tail
      path: /var/log/audit/audit.log
      tag: audit
      key: message
      storage.type: filesystem
      alias: audit
  outputs:
    - name: datadog
      match: audit
      host: http-intake.logs.${dd_site}
      tls: on
      compress: gzip
      apikey: ${dd_api_key}
      dd_source: audit
      dd_service: drt-cockroachdb
      dd_tags: env:development,cluster:${CLUSTER%:*},service:drt-cockroachdb,team:drt
      alias: audit
      storage.total_limit_size: 25MB
EOF"

# Export the Datadog settings through a profile.d script on the cluster.
roachprod ssh "${CLUSTER}" -- "sudo tee /etc/profile.d/99-datadog.sh > /dev/null << EOF
export DD_SITE=${dd_site}
export DD_API_KEY=${dd_api_key}
export DD_TAGS=env:development,cluster:${CLUSTER%:*},team:drt,service:drt-cockroachdb
EOF"

roachprod opentelemetry-start "${CLUSTER}" \
  --datadog-api-key "${dd_api_key}" \
  --datadog-tags 'service:drt-cockroachdb,team:drt'

roachprod fluent-bit-start "${CLUSTER}" \
  --datadog-api-key "${dd_api_key}" \
  --datadog-service drt-cockroachdb \
  --datadog-tags 'service:drt-cockroachdb,team:drt'

echo
echo "Updated $CLUSTER configuration to send telemetry data to Datadog."
echo
echo "If this was the first time this script was run against $CLUSTER then"
echo "CockroachDB must be restarted to reload its logging configuration."
echo

exit 0

0 commit comments

Comments
 (0)