Skip to content

Commit 6e1021a

Browse files
authored
[internal-dns] fix for clickhouse zone services (#7612)
This commit fixes an issue where the `ClickhouseNative` SRV record was pointing to both `clickhouse` and `clickhouse_cluster` IPs. This was causing Oximeter to indiscriminately write to either single node or cluster when both were running side by side. Additionally, we add a record for `ClickhouseAdminSingleServer`. ## Manual testing on a local omicron deployment ### With only single node enabled ```console coatlicue@centzon:~/src/omicron$ pfexec zlogin oxz_clickhouse_ae7bf43d-8234-4b6f-ab3a-540685f5ba41 [Connected to zone 'oxz_clickhouse_ae7bf43d-8234-4b6f-ab3a-540685f5ba41' pts/3] The illumos Project helios-2.0.23078 December 2024 root@oxz_clickhouse_ae7bf43d:~# nslookup -type=SRV _clickhouse-native._tcp.control-plane.oxide.internal ;; Got recursion not available from fd00:1122:3344:1::1, trying next server ;; Got recursion not available from fd00:1122:3344:2::1, trying next server Server: fd00:1122:3344:3::1 Address: fd00:1122:3344:3::1#53 Non-authoritative answer: _clickhouse-native._tcp.control-plane.oxide.internal service = 0 0 9000 ae7bf43d-8234-4b6f-ab3a-540685f5ba41.host.control-plane.oxide.internal. Authoritative answers can be found from: ae7bf43d-8234-4b6f-ab3a-540685f5ba41.host.control-plane.oxide.internal has AAAA address fd00:1122:3344:101::e root@oxz_clickhouse_ae7bf43d:~# nslookup -type=SRV _clickhouse-admin-single-server._tcp.control-plane.oxide.internal ;; Got recursion not available from fd00:1122:3344:1::1, trying next server ;; Got recursion not available from fd00:1122:3344:2::1, trying next server Server: fd00:1122:3344:3::1 Address: fd00:1122:3344:3::1#53 Non-authoritative answer: _clickhouse-admin-single-server._tcp.control-plane.oxide.internal service = 0 0 8888 ae7bf43d-8234-4b6f-ab3a-540685f5ba41.host.control-plane.oxide.internal. 
Authoritative answers can be found from: ae7bf43d-8234-4b6f-ab3a-540685f5ba41.host.control-plane.oxide.internal has AAAA address fd00:1122:3344:101::e ``` ### With both single node and cluster enabled ```console root@oxz_clickhouse_ae7bf43d:~# nslookup -type=SRV _clickhouse-admin-single-server._tcp.control-plane.oxide.internal ;; Got recursion not available from fd00:1122:3344:1::1, trying next server ;; Got recursion not available from fd00:1122:3344:2::1, trying next server Server: fd00:1122:3344:3::1 Address: fd00:1122:3344:3::1#53 Non-authoritative answer: _clickhouse-admin-single-server._tcp.control-plane.oxide.internal service = 0 0 8888 ae7bf43d-8234-4b6f-ab3a-540685f5ba41.host.control-plane.oxide.internal. Authoritative answers can be found from: ae7bf43d-8234-4b6f-ab3a-540685f5ba41.host.control-plane.oxide.internal has AAAA address fd00:1122:3344:101::e root@oxz_clickhouse_ae7bf43d:~# nslookup -type=SRV _clickhouse-native._tcp.control-plane.oxide.internal ;; Got recursion not available from fd00:1122:3344:1::1, trying next server ;; Got recursion not available from fd00:1122:3344:2::1, trying next server Server: fd00:1122:3344:3::1 Address: fd00:1122:3344:3::1#53 Non-authoritative answer: _clickhouse-native._tcp.control-plane.oxide.internal service = 0 0 9000 ae7bf43d-8234-4b6f-ab3a-540685f5ba41.host.control-plane.oxide.internal. 
Authoritative answers can be found from: ae7bf43d-8234-4b6f-ab3a-540685f5ba41.host.control-plane.oxide.internal has AAAA address fd00:1122:3344:101::e root@oxz_clickhouse_ae7bf43d:~# nslookup -type=SRV _clickhouse-admin-server._tcp.control-plane.oxide.internal ;; Got recursion not available from fd00:1122:3344:1::1, trying next server ;; Got recursion not available from fd00:1122:3344:2::1, trying next server Server: fd00:1122:3344:3::1 Address: fd00:1122:3344:3::1#53 Non-authoritative answer: _clickhouse-admin-server._tcp.control-plane.oxide.internal service = 0 0 8888 1ebb94b9-9cc4-4c4a-8402-f758cc3b1173.host.control-plane.oxide.internal. _clickhouse-admin-server._tcp.control-plane.oxide.internal service = 0 0 8888 b817f829-383f-402a-be4a-b393c1afdff0.host.control-plane.oxide.internal. _clickhouse-admin-server._tcp.control-plane.oxide.internal service = 0 0 8888 bdc23f73-83f9-4029-a749-2375d8cb4033.host.control-plane.oxide.internal. Authoritative answers can be found from: 1ebb94b9-9cc4-4c4a-8402-f758cc3b1173.host.control-plane.oxide.internal has AAAA address fd00:1122:3344:101::26 b817f829-383f-402a-be4a-b393c1afdff0.host.control-plane.oxide.internal has AAAA address fd00:1122:3344:101::27 bdc23f73-83f9-4029-a749-2375d8cb4033.host.control-plane.oxide.internal has AAAA address fd00:1122:3344:101::28 ``` #### single-node ```console root@oxz_clickhouse_ae7bf43d:~# /opt/oxide/clickhouse/clickhouse client --host fd00:1122:3344:101::e -q "select * from oximeter.fields_string limit 5" ddm_router:originated_tunnel_endpoints 1927403117836933753 hostname centzon ddm_router:originated_tunnel_endpoints 13387177631086028603 hostname oxz_switch ddm_router:originated_underlay_prefixes 12164380667314673492 hostname centzon ddm_router:originated_underlay_prefixes 9657734132044246100 hostname oxz_switch ddm_session:advertisements_received 5533636839163709296 hostname centzon ``` #### cluster ```console oximeter_cluster_1 :) select * from oximeter.fields_string limit 5 SELECT * 
FROM oximeter.fields_string LIMIT 5 Query id: c4771ce0-1b36-4f48-aca9-068ec826a67b Ok. 0 rows in set. Elapsed: 0.002 sec. ``` Closes: #7577
1 parent 3721385 commit 6e1021a

File tree

6 files changed

+190
-36
lines changed

6 files changed

+190
-36
lines changed

internal-dns/types/src/config.rs

Lines changed: 92 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -399,55 +399,90 @@ impl DnsConfigBuilder {
399399
self.service_backend_zone(ServiceName::Mgd, &zone, mgd_port)
400400
}
401401

402-
/// Higher-level shorthand for adding a ClickHouse zone with several
402+
/// Higher-level shorthand for adding a ClickHouse single node zone with
403+
/// several services.
404+
///
405+
/// The ClickHouse single node server exposes several interfaces on the
406+
/// network. We use both a simple HTTP interface as well as a lower-level
407+
/// protocol over TCP, called the "Native protocol". This method inserts a
408+
/// zone and the related records for both of these services.
409+
///
410+
/// `http_service` is the `ServiceName` for the HTTP service that belongs in
411+
/// this zone, and `http_port` is the associated port for that service. The
412+
/// native service is added automatically, using its default port.
413+
///
414+
/// We also add a `ClickhouseAdminSingleServer` service.
415+
///
416+
/// # Errors
417+
///
418+
/// This fails if the provided `http_service` is not for a ClickHouse single
419+
/// node server. It also fails if the given zone has already been added
420+
/// to the configuration.
421+
pub fn host_zone_clickhouse_single_node(
422+
&mut self,
423+
zone_id: OmicronZoneUuid,
424+
http_service: ServiceName,
425+
http_address: SocketAddrV6,
426+
) -> anyhow::Result<()> {
427+
anyhow::ensure!(
428+
http_service == ServiceName::Clickhouse,
429+
"This method is only valid for the ClickHouse single node server, \
430+
but we were provided the service '{http_service:?}'",
431+
);
432+
let zone = self.host_zone(zone_id, *http_address.ip())?;
433+
self.service_backend_zone(http_service, &zone, http_address.port())?;
434+
435+
self.service_backend_zone(
436+
ServiceName::ClickhouseNative,
437+
&zone,
438+
CLICKHOUSE_TCP_PORT,
439+
)?;
440+
self.service_backend_zone(
441+
ServiceName::ClickhouseAdminSingleServer,
442+
&zone,
443+
CLICKHOUSE_ADMIN_PORT,
444+
)
445+
}
446+
447+
/// Higher-level shorthand for adding a ClickHouse cluster zone with several
403448
/// services.
404449
///
405450
/// ClickHouse servers expose several interfaces on the network. We use both
406451
/// a simple HTTP interface as well as a lower-level protocol over TCP,
407452
/// called the "Native protocol". This method inserts a zone and the related
408453
/// records for both of these services.
454+
/// (TODO <https://github.com/oxidecomputer/omicron/issues/7419>: Add Native protocol
455+
/// interface)
409456
///
410457
/// `http_service` is the `ServiceName` for the HTTP service that belongs in
411458
/// this zone, and `http_port` is the associated port for that service. The
412459
/// native service is added automatically, using its default port.
413460
///
414-
/// For `ClickhouseServer` zones we also need to add a
415-
/// `ClickhouseAdminServer` service.
461+
/// We also add a `ClickhouseAdminServer` service.
416462
///
417463
/// # Errors
418464
///
419-
/// This fails if the provided `http_service` is not for a ClickHouse
465+
/// This fails if the provided `http_service` is not for a ClickHouse cluster
420466
/// replica server. It also fails if the given zone has already been added
421467
/// to the configuration.
422-
pub fn host_zone_clickhouse(
468+
pub fn host_zone_clickhouse_cluster(
423469
&mut self,
424470
zone_id: OmicronZoneUuid,
425471
http_service: ServiceName,
426472
http_address: SocketAddrV6,
427473
) -> anyhow::Result<()> {
428474
anyhow::ensure!(
429-
http_service == ServiceName::Clickhouse
430-
|| http_service == ServiceName::ClickhouseServer,
431-
"This method is only valid for ClickHouse replica servers, \
475+
http_service == ServiceName::ClickhouseServer,
476+
"This method is only valid for ClickHouse cluster replica servers, \
432477
but we were provided the service '{http_service:?}'",
433478
);
434479
let zone = self.host_zone(zone_id, *http_address.ip())?;
435480
self.service_backend_zone(http_service, &zone, http_address.port())?;
436481
self.service_backend_zone(
437-
ServiceName::ClickhouseNative,
482+
ServiceName::ClickhouseAdminServer,
438483
&zone,
439-
CLICKHOUSE_TCP_PORT,
440-
)?;
441-
442-
if http_service == ServiceName::ClickhouseServer {
443-
self.service_backend_zone(
444-
ServiceName::ClickhouseAdminServer,
445-
&zone,
446-
CLICKHOUSE_ADMIN_PORT,
447-
)?;
448-
}
449-
450-
Ok(())
484+
CLICKHOUSE_ADMIN_PORT,
485+
)
451486
}
452487

453488
/// Higher-level shorthand for adding a ClickhouseKeeper zone with several
@@ -687,7 +722,9 @@ mod test {
687722
use crate::{config::Zone, names::DNS_ZONE};
688723
use omicron_common::api::external::Generation;
689724
use omicron_uuid_kinds::{OmicronZoneUuid, SledUuid};
690-
use std::{collections::BTreeMap, io::Write, net::Ipv6Addr};
725+
use std::{
726+
collections::BTreeMap, io::Write, net::Ipv6Addr, net::SocketAddrV6,
727+
};
691728

692729
#[test]
693730
fn display_srv_service() {
@@ -700,10 +737,18 @@ mod test {
700737
ServiceName::ClickhouseAdminServer.dns_name(),
701738
"_clickhouse-admin-server._tcp",
702739
);
740+
assert_eq!(
741+
ServiceName::ClickhouseAdminSingleServer.dns_name(),
742+
"_clickhouse-admin-single-server._tcp",
743+
);
703744
assert_eq!(
704745
ServiceName::ClickhouseKeeper.dns_name(),
705746
"_clickhouse-keeper._tcp",
706747
);
748+
assert_eq!(
749+
ServiceName::ClickhouseNative.dns_name(),
750+
"_clickhouse-native._tcp",
751+
);
707752
assert_eq!(
708753
ServiceName::ClickhouseServer.dns_name(),
709754
"_clickhouse-server._tcp",
@@ -756,12 +801,19 @@ mod test {
756801
const ZONE2_UUID: &'static str = "001de000-c04e-4000-8000-000000000002";
757802
const ZONE3_UUID: &'static str = "001de000-c04e-4000-8000-000000000003";
758803
const ZONE4_UUID: &'static str = "001de000-c04e-4000-8000-000000000004";
804+
const ZONE_CLICKHOUSE_UUID: &'static str =
805+
"001de000-c04e-4000-8000-000000000005";
806+
const ZONE_CLICKHOUSE_SERVER_UUID: &'static str =
807+
"001de000-c04e-4000-8000-000000000006";
759808
const SLED1_IP: Ipv6Addr = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1);
760809
const SLED2_IP: Ipv6Addr = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 2);
761810
const ZONE1_IP: Ipv6Addr = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 1, 1);
762811
const ZONE2_IP: Ipv6Addr = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 1, 2);
763812
const ZONE3_IP: Ipv6Addr = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 1, 3);
764813
const ZONE4_IP: Ipv6Addr = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 1, 4);
814+
const ZONE_CLICKHOUSE_IP: Ipv6Addr = Ipv6Addr::new(0, 0, 0, 0, 0, 0, 1, 5);
815+
const ZONE_CLICKHOUSE_SERVER_IP: Ipv6Addr =
816+
Ipv6Addr::new(0, 0, 0, 0, 0, 0, 1, 6);
765817

766818
#[test]
767819
fn test_builder_output() {
@@ -773,6 +825,10 @@ mod test {
773825
let zone2_uuid: OmicronZoneUuid = ZONE2_UUID.parse().unwrap();
774826
let zone3_uuid: OmicronZoneUuid = ZONE3_UUID.parse().unwrap();
775827
let zone4_uuid: OmicronZoneUuid = ZONE4_UUID.parse().unwrap();
828+
let zone_clickhouse_uuid: OmicronZoneUuid =
829+
ZONE_CLICKHOUSE_UUID.parse().unwrap();
830+
let zone_clickhouse_server_uuid: OmicronZoneUuid =
831+
ZONE_CLICKHOUSE_SERVER_UUID.parse().unwrap();
776832

777833
let builder_empty = DnsConfigBuilder::new();
778834

@@ -818,6 +874,20 @@ mod test {
818874
b.service_backend_zone(ServiceName::BoundaryNtp, &zone2, 127)
819875
.unwrap();
820876

877+
// Add clickhouse and clickhouse server zones, which have several services each
878+
b.host_zone_clickhouse_single_node(
879+
zone_clickhouse_uuid,
880+
ServiceName::Clickhouse,
881+
SocketAddrV6::new(ZONE_CLICKHOUSE_IP, 0, 0, 0),
882+
)
883+
.unwrap();
884+
b.host_zone_clickhouse_cluster(
885+
zone_clickhouse_server_uuid,
886+
ServiceName::ClickhouseServer,
887+
SocketAddrV6::new(ZONE_CLICKHOUSE_SERVER_IP, 0, 0, 0),
888+
)
889+
.unwrap();
890+
821891
// A sharded service
822892
b.service_backend_sled(
823893
ServiceName::SledAgent(sled1_uuid),

internal-dns/types/src/names.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ pub enum ServiceName {
2929
ClickhouseAdminKeeper,
3030
/// The HTTP interface for managing replicated clickhouse servers
3131
ClickhouseAdminServer,
32+
/// The HTTP interface for managing a single node clickhouse server
33+
ClickhouseAdminSingleServer,
3234
/// The native TCP interface to a ClickHouse server.
3335
///
3436
/// NOTE: This is used for either single-node or a replicated cluster.
@@ -61,6 +63,9 @@ impl ServiceName {
6163
ServiceName::Clickhouse => "clickhouse",
6264
ServiceName::ClickhouseAdminKeeper => "clickhouse-admin-keeper",
6365
ServiceName::ClickhouseAdminServer => "clickhouse-admin-server",
66+
ServiceName::ClickhouseAdminSingleServer => {
67+
"clickhouse-admin-single-server"
68+
}
6469
ServiceName::ClickhouseNative => "clickhouse-native",
6570
ServiceName::ClickhouseKeeper => "clickhouse-keeper",
6671
ServiceName::ClickhouseServer => "clickhouse-server",
@@ -90,6 +95,7 @@ impl ServiceName {
9095
ServiceName::Clickhouse
9196
| ServiceName::ClickhouseAdminKeeper
9297
| ServiceName::ClickhouseAdminServer
98+
| ServiceName::ClickhouseAdminSingleServer
9399
| ServiceName::ClickhouseNative
94100
| ServiceName::ClickhouseKeeper
95101
| ServiceName::ClickhouseServer

internal-dns/types/tests/output/internal-dns-zone.txt

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,18 @@ builder: "non_trivial"
6868
"data": "::1:4"
6969
}
7070
],
71+
"001de000-c04e-4000-8000-000000000005.host": [
72+
{
73+
"type": "AAAA",
74+
"data": "::1:5"
75+
}
76+
],
77+
"001de000-c04e-4000-8000-000000000006.host": [
78+
{
79+
"type": "AAAA",
80+
"data": "::1:6"
81+
}
82+
],
7183
"_boundary-ntp._tcp": [
7284
{
7385
"type": "SRV",
@@ -79,6 +91,61 @@ builder: "non_trivial"
7991
}
8092
}
8193
],
94+
"_clickhouse-admin-server._tcp": [
95+
{
96+
"type": "SRV",
97+
"data": {
98+
"prio": 0,
99+
"weight": 0,
100+
"port": 8888,
101+
"target": "001de000-c04e-4000-8000-000000000006.host.control-plane.oxide.internal"
102+
}
103+
}
104+
],
105+
"_clickhouse-admin-single-server._tcp": [
106+
{
107+
"type": "SRV",
108+
"data": {
109+
"prio": 0,
110+
"weight": 0,
111+
"port": 8888,
112+
"target": "001de000-c04e-4000-8000-000000000005.host.control-plane.oxide.internal"
113+
}
114+
}
115+
],
116+
"_clickhouse-native._tcp": [
117+
{
118+
"type": "SRV",
119+
"data": {
120+
"prio": 0,
121+
"weight": 0,
122+
"port": 9000,
123+
"target": "001de000-c04e-4000-8000-000000000005.host.control-plane.oxide.internal"
124+
}
125+
}
126+
],
127+
"_clickhouse-server._tcp": [
128+
{
129+
"type": "SRV",
130+
"data": {
131+
"prio": 0,
132+
"weight": 0,
133+
"port": 0,
134+
"target": "001de000-c04e-4000-8000-000000000006.host.control-plane.oxide.internal"
135+
}
136+
}
137+
],
138+
"_clickhouse._tcp": [
139+
{
140+
"type": "SRV",
141+
"data": {
142+
"prio": 0,
143+
"weight": 0,
144+
"port": 0,
145+
"target": "001de000-c04e-4000-8000-000000000005.host.control-plane.oxide.internal"
146+
}
147+
}
148+
],
82149
"_nexus._tcp": [
83150
{
84151
"type": "SRV",

nexus/test-utils/src/lib.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,11 @@ impl RackInitRequestBuilder {
313313
address: SocketAddrV6,
314314
) {
315315
self.internal_dns_config
316-
.host_zone_clickhouse(zone_id, ServiceName::Clickhouse, address)
316+
.host_zone_clickhouse_single_node(
317+
zone_id,
318+
ServiceName::Clickhouse,
319+
address,
320+
)
317321
.expect("Failed to setup ClickHouse DNS");
318322
}
319323
}

nexus/types/src/deployment/execution/dns.rs

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -48,25 +48,28 @@ pub fn blueprint_internal_dns_config(
4848
) => (ServiceName::InternalNtp, address),
4949
BlueprintZoneType::Clickhouse(
5050
blueprint_zone_type::Clickhouse { address, .. },
51-
)
52-
| BlueprintZoneType::ClickhouseServer(
51+
) => {
52+
// Add the HTTP, native TCP, and admin interfaces for the ClickHouse
53+
// single node server. This adds the zone itself, so we need to continue
54+
// back up to the loop over all the Omicron zones, rather than
55+
// falling through to call `host_zone_with_one_backend()`.
56+
dns_builder.host_zone_clickhouse_single_node(
57+
zone.id,
58+
ServiceName::Clickhouse,
59+
*address,
60+
)?;
61+
continue 'all_zones;
62+
}
63+
BlueprintZoneType::ClickhouseServer(
5364
blueprint_zone_type::ClickhouseServer { address, .. },
5465
) => {
5566
// Add the HTTP and admin interfaces for ClickHouse cluster data
5667
// replicas. This adds the zone itself, so we need to continue
5768
// back up to the loop over all the Omicron zones, rather than
5869
// falling through to call `host_zone_with_one_backend()`.
59-
let http_service = if matches!(
60-
&zone.zone_type,
61-
BlueprintZoneType::Clickhouse(_)
62-
) {
63-
ServiceName::Clickhouse
64-
} else {
65-
ServiceName::ClickhouseServer
66-
};
67-
dns_builder.host_zone_clickhouse(
70+
dns_builder.host_zone_clickhouse_cluster(
6871
zone.id,
69-
http_service,
72+
ServiceName::ClickhouseServer,
7073
*address,
7174
)?;
7275
continue 'all_zones;

sled-agent/src/rack_setup/plan/service.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -607,7 +607,11 @@ impl Plan {
607607
let http_port = omicron_common::address::CLICKHOUSE_HTTP_PORT;
608608
let http_address = SocketAddrV6::new(ip, http_port, 0, 0);
609609
dns_builder
610-
.host_zone_clickhouse(id, ServiceName::Clickhouse, http_address)
610+
.host_zone_clickhouse_single_node(
611+
id,
612+
ServiceName::Clickhouse,
613+
http_address,
614+
)
611615
.unwrap();
612616
let dataset_name =
613617
sled.alloc_dataset_from_u2s(DatasetKind::Clickhouse)?;

0 commit comments

Comments
 (0)