-
Notifications
You must be signed in to change notification settings - Fork 976
Description
Hello.
We are experiencing a recurring issue with dnsdist 2.0.1 running on Ubuntu 22.04 (IPv6 fully disabled on kernel level via GRUB).
Approximately once every 1–2 months, all three recursor backends in the recursor pool suddenly start failing health checks at the same time. dnsdist marks all three recursors as down with continuous ServFail responses.
Only the recursor pool is affected — authoritative pools and DNSCrypt pools continue working normally.
The problem persists until dnsdist or the recursors are restarted manually.
Environment details
- dnsdist version: 2.0.1
- OS: Ubuntu Server 22.04 LTS
- IPv6: Disabled via GRUB_CMDLINE_LINUX_DEFAULT="ipv6.disable=1"
Topology:
- 3× dnsdist instances
- Connected via keepalived to a single shared VIP
- Identical configuration on all three nodes
Recursor backends:
- 10.18.41.6:5301
- 10.18.41.7:5301
- 10.18.41.8:5301
Recursors themselves are configured very simply.
Problem description
Once every ~30–60 days, all three recursors begin failing health checks simultaneously.
dnsdist logs show continuous errors like:
Backend 10.18.41.6:5301 responded to health check with ServFail
Backend 10.18.41.7:5301 responded to health check with ServFail
Backend 10.18.41.8:5301 responded to health check with ServFail
Repeat every 5 seconds.
This state is restored automatically after 10 minutes.
Restarting either dnsdist or the recursor instances immediately resolves the issue.
Example of log output
Dec 12 07:57:48 dnsdist: Backend 10.18.41.6:5301 responded to health check with ServFail
Dec 12 07:57:48 dnsdist: Backend 10.18.41.8:5301 responded to health check with ServFail
Dec 12 07:57:48 dnsdist: Backend 10.18.41.7:5301 responded to health check with ServFail
Dec 12 07:57:53 dnsdist: Backend 10.18.41.6:5301 responded to health check with ServFail
Dec 12 07:57:53 dnsdist: Backend 10.18.41.8:5301 responded to health check with ServFail
Dec 12 07:57:53 dnsdist: Backend 10.18.41.7:5301 responded to health check with ServFail
... repeating infinitely
Full dnsdist configuration
setLocal('127.0.0.1')
addLocal('10.18.41.2')
addLocal('10.18.41.3')
addLocal('10.18.41.34')
addLocal('10.18.41.35')
setACL({'0.0.0.0/0'})
setSyslogFacility(0)
setVerboseHealthChecks(true)
controlSocket('127.0.0.1:5199')
setConsoleACL('127.0.0.1')
setKey("ej0NCggL+KefBk60w5BdJ/4v9RU42hNp2CrU/awltHg=")
setVerboseHealthChecks(true)
setRoundRobinFailOnNoServer(true)
setServerPolicy(roundrobin)
addDOHLocal("10.18.41.2:443",
"/opt/doh_sert/fullchain.pem",
"/opt/doh_sert/key.key")
pc = newPacketCache(4000000, {
maxTTL=86400,
minTTL=0,
temporaryFailureTTL=60,
staleTTL=60,
dontAge=false
})
tap_logging = newFrameStreamTcpLogger("10.18.41.15:6000")
addAction(AllRule(), DnstapLogAction("dnsdist_server", tap_logging))
newServer({
address="10.18.41.12:5454",
pool="ext-dnscrypt"
})
newServer({
address="10.18.41.13:5454",
pool="ext-dnscrypt"
})
newServer({
address="10.18.41.14:5454",
pool="ext-dnscrypt"
})
specific_domains = newSuffixMatchNode()
-- specific_domains:add(newDNSName("mt.fppk.com"))
-- specific_domains:add(newDNSName("mx.fppk.com"))
addAction(
AndRule({
NetmaskGroupRule({'172.16.0.0/12'}),
SuffixMatchNodeRule(specific_domains)
}),
PoolAction("ext-dnscrypt")
)
mgc_lan_domains = newSuffixMatchNode()
mgc_lan_domains:add(newDNSName("mgc.lan"))
addAction(
AndRule({
NetmaskGroupRule({'10.18.24.0/21'}),
SuffixMatchNodeRule(mgc_lan_domains)
}),
PoolAction("geoip-auth-primary")
)
-- Пул гостевых вайфай
addAction(
NetmaskGroupRule({'10.0.200.0/24'}),
PoolAction("ext-dnscrypt")
)
newServer({
address='10.18.41.6:5301',
pool='recursor',
checkName='google.com.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2,
sockets=16
})
newServer({
address='10.18.41.7:5301',
pool='recursor',
checkName='google.com.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.18.41.8:5301',
pool='recursor',
checkName='google.com.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.18.0.2:53',
pool='auth-primary',
checkName='mgc.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.18.41.20:5304',
pool='geoip-auth-primary',
checkName='vm1.mgc.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.20.1.2:53',
pool='lcc1-auth-primary',
checkName='dc.lcc1.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.18.1.2:53',
pool='hhc-auth-primary',
checkName='dc.hhc.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.0.241.2:53',
pool='local-auth-primary',
checkName='s-f-dc.fx.local.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.20.4.42:53',
pool='pr-auth-primary',
checkName='dc.pr.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.18.2.2:53',
pool='bccb-auth-primary',
checkName='dc.bccb.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.42.0.2:53',
pool='media-auth-primary',
checkName='dc.media.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.18.235.2:53',
pool='ahcc-auth-primary',
checkName='dc.ahcc.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.18.236.2:53',
pool='afcc-auth-primary',
checkName='dc.afcc.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.20.11.2:53',
pool='est-auth-primary',
checkName='dc.est.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.18.234.2:53',
pool='accg-auth-primary',
checkName='dc.accg.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.41.52.2:53',
pool='gds-auth-primary',
checkName='dc.gds.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({
address='10.41.192.2:53',
pool='iqnd-auth-primary',
checkName='dc.iqnd.lan.',
mustResolve=true,
checkType="A",
checkInterval=5,
checkTimeout=1,
maxCheckFailures=3,
rise=2
})
newServer({address='10.18.41.9:5300', pool='auth'})
-- newServer({address='10.18.41.10:5300', pool='auth'})
newServer({address='10.18.41.11:5300', pool='auth'})
getPool("recursor"):setCache(pc)
getPool("auth"):setCache(pc)
getPool("ext-dnscrypt"):setCache(pc)
getPool(""):setCache(pc)
msdcs_domain = newSuffixMatchNode()
msdcs_domain:add(newDNSName("_msdcs.mgc.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain), PoolAction('auth-primary'))
msdcs_domain_est = newSuffixMatchNode()
msdcs_domain_est:add(newDNSName("_msdcs.est.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_est), PoolAction('est-auth-primary'))
msdcs_domain_lcc1 = newSuffixMatchNode()
msdcs_domain_lcc1:add(newDNSName("_msdcs.lcc1.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_lcc1), PoolAction('lcc1-auth-primary'))
msdcs_domain_accg = newSuffixMatchNode()
msdcs_domain_accg:add(newDNSName("_msdcs.accg.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_accg), PoolAction('accg-auth-primary'))
msdcs_domain_iqnd = newSuffixMatchNode()
msdcs_domain_iqnd:add(newDNSName("_msdcs.iqnd.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_iqnd), PoolAction('iqnd-auth-primary'))
msdcs_domain_media = newSuffixMatchNode()
msdcs_domain_media:add(newDNSName("_msdcs.media.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_media), PoolAction('media-auth-primary'))
msdcs_domain_ahcc = newSuffixMatchNode()
msdcs_domain_ahcc:add(newDNSName("_msdcs.ahcc.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_ahcc), PoolAction('ahcc-auth-primary'))
msdcs_domain_afcc = newSuffixMatchNode()
msdcs_domain_afcc:add(newDNSName("_msdcs.afcc.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_afcc), PoolAction('afcc-auth-primary'))
msdcs_domain_hhc = newSuffixMatchNode()
msdcs_domain_hhc:add(newDNSName("_msdcs.hhc.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_hhc), PoolAction('hhc-auth-primary'))
msdcs_domain_local = newSuffixMatchNode()
msdcs_domain_local:add(newDNSName("_msdcs.fx.local"))
addAction(SuffixMatchNodeRule(msdcs_domain_local), PoolAction('local-auth-primary'))
msdcs_domain_bccb = newSuffixMatchNode()
msdcs_domain_bccb:add(newDNSName("_msdcs.bccb.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_bccb), PoolAction('bccb-auth-primary'))
msdcs_domain_pr = newSuffixMatchNode()
msdcs_domain_pr:add(newDNSName("_msdcs.pr.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_pr), PoolAction('pr-auth-primary'))
msdcs_domain_gs = newSuffixMatchNode()
msdcs_domain_gs:add(newDNSName("_msdcs.gs.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_gs), PoolAction('gs-auth-primary'))
msdcs_domain_pr = newSuffixMatchNode()
msdcs_domain_pr:add(newDNSName("pr.lan"))
addAction(SuffixMatchNodeRule(msdcs_domain_pr), PoolAction('auth-primary'))
domain_names = newSuffixMatchNode()
domain_names:add(newDNSName("lan"))
-- addAction(SuffixMatchNodeRule(msdcs_domain), PoolAction('auth-primary'))
addAction(SuffixMatchNodeRule(domain_names), PoolAction('auth'))
addAction(AllRule(), PoolAction('recursor'))
-- tap_logging = newFrameStreamTcpLogger("10.18.20.145:6009")
-- addAction(AllRule(), DnstapLogAction("dnsdist_server", tap_logging))
webserver("0.0.0.0:8083")
setWebserverConfig({
password="$scrypt$ln=10,p=1,r=8$RSYJ2QDmdlkNYMyqZF/FWw==$JQTftQCvAXR4Qtrg0lQmvrzgYEo3/PjEeuV4/2Oq1Vg=",
apiKey="$scrypt$ln=10,p=1,r=8$RSYJ2QDmdlkNYMyqZF/FWw==$JQTftQCvAXR4Qtrg0lQmvrzgYEo3/PjEeuV4/2Oq1Vg=",
acl="10.0.0.0/8"
})
Any advice is very welcome.
Thank you!