From e7762f9c073cd7a272c44b6327518eae881c74c6 Mon Sep 17 00:00:00 2001 From: Fabian Holler Date: Tue, 22 Oct 2024 17:11:13 +0200 Subject: [PATCH 1/4] events: react on Action field instead of deprecated Event field --- go.mod | 2 +- registrator.go | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 74c6c712..2b2df838 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.23.2 require ( github.com/coreos/go-etcd v2.0.0+incompatible + github.com/docker/docker v27.3.1+incompatible github.com/fsouza/go-dockerclient v1.12.0 github.com/gliderlabs/pkg v0.0.0-20161206023812-36f28d47ec7a github.com/hashicorp/consul/api v1.29.5 @@ -19,7 +20,6 @@ require ( github.com/armon/go-metrics v0.4.1 // indirect github.com/containerd/log v0.1.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/docker/docker v27.3.1+incompatible // indirect github.com/docker/go-connections v0.5.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/fatih/color v1.16.0 // indirect diff --git a/registrator.go b/registrator.go index 02523e05..7784f9c9 100644 --- a/registrator.go +++ b/registrator.go @@ -10,6 +10,7 @@ import ( "strings" "time" + dockerevents "github.com/docker/docker/api/types/events" dockerapi "github.com/fsouza/go-dockerclient" "github.com/gliderlabs/pkg/usage" "github.com/simplesurance/registrator/bridge" @@ -174,12 +175,12 @@ func main() { // Process Docker events for msg := range events { - switch msg.Status { - case "start": + switch dockerevents.Action(msg.Action) { + case dockerevents.ActionStart, dockerevents.ActionUnPause: go b.Add(msg.ID) - case "die": + case dockerevents.ActionDie: go b.RemoveOnExit(msg.ID) - case "kill": + case dockerevents.ActionKill: if *deregisterOnStop { go b.RemoveOnExit(msg.ID) } From 47ae093026f0894e202fa7ea7a37d912905f4dc2 Mon Sep 17 00:00:00 2001 From: Fabian Holler Date: Tue, 22 Oct 2024 17:14:43 +0200 Subject: [PATCH 2/4] fix: docker events 
can get lost A channel without a buffer was used to receive dockerapi events. Events are only sent to the channel if the operation does not block. This means when registrator is still processing an event, events sent to the channel will get lost. Make it unlikely to happen by using a channel with a buffer of 128 messages. --- registrator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/registrator.go b/registrator.go index 7784f9c9..c49a50a0 100644 --- a/registrator.go +++ b/registrator.go @@ -133,7 +133,7 @@ func main() { } // Start event listener before listing containers to avoid missing anything - events := make(chan *dockerapi.APIEvents) + events := make(chan *dockerapi.APIEvents, 128) assert(docker.AddEventListener(events)) log.Println("Listening for Docker events ...") From 103eadb5c9ae02964f6616b7ddea26580751ee09 Mon Sep 17 00:00:00 2001 From: Fabian Holler Date: Tue, 22 Oct 2024 16:45:05 +0200 Subject: [PATCH 3/4] only register containers that are in running and restarting state During Sync() registrator is registering all docker containers including containers that aren't running. Afterwards it unregisters all containers again that aren't in status created, restarting, running or paused. Change it to only register containers that are running or restarting. It's not clear why containers are registered that are not running at all. This will allow removing the logic to mark containers as dying; the issue that containers can be reregistered during shutdown cannot happen anymore.
--- bridge/bridge.go | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/bridge/bridge.go b/bridge/bridge.go index 79c4c192..6965400b 100644 --- a/bridge/bridge.go +++ b/bridge/bridge.go @@ -94,18 +94,19 @@ func (b *Bridge) Sync(quiet bool) { b.Lock() defer b.Unlock() - containers, err := b.docker.ListContainers(dockerapi.ListContainersOptions{}) + filters := map[string][]string{"status": {"running", "restarting"}} + runningContainers, err := b.docker.ListContainers(dockerapi.ListContainersOptions{Filters: filters}) if err != nil && quiet { - log.Println("error listing containers, skipping sync") + log.Println("skipping sync, listing containers failed:", err) return } else if err != nil && !quiet { log.Fatal(err) } - log.Printf("Syncing services on %d containers", len(containers)) + log.Printf("Syncing services on %d running containers", len(runningContainers)) // NOTE: This assumes reregistering will do the right thing, i.e. nothing.. - for _, listing := range containers { + for _, listing := range runningContainers { services := b.services[listing.ID] if services == nil { b.add(listing.ID, quiet) @@ -124,15 +125,9 @@ func (b *Bridge) Sync(quiet bool) { if b.config.Cleanup { // Remove services if its corresponding container is not running log.Println("Listing non-exited containers") - filters := map[string][]string{"status": {"created", "restarting", "running", "paused"}} - nonExitedContainers, err := b.docker.ListContainers(dockerapi.ListContainersOptions{Filters: filters}) - if err != nil { - log.Println("error listing nonExitedContainers, skipping sync", err) - return - } for listingId := range b.services { found := false - for _, container := range nonExitedContainers { + for _, container := range runningContainers { if listingId == container.ID { found = true break @@ -140,7 +135,7 @@ func (b *Bridge) Sync(quiet bool) { } // This is a container that does not exist if !found { - log.Printf("stale: Removing service %s because 
it does not exist", listingId) +			log.Printf("stale: Removing service %s because it's container is not running ", listingId) go b.RemoveOnExit(listingId) } } From 17c55df181737c803f0616c9aac1b8bc62af3a6e Mon Sep 17 00:00:00 2001 From: Fabian Holler Date: Tue, 22 Oct 2024 16:53:36 +0200 Subject: [PATCH 4/4] remove tracking stopped containers as dying Do not mark containers as dying after they have been stopped. Because of this mechanism, a container that is restarted is not registered again: the container is marked as dying until the ttl-dying-cleanup TTL expires. If the container is restarted before the TTL expires, it is ignored and not registered again. Containers are also only unmarked as dying when bridge.add() is called; if this did not happen even though the TTL expired and the container was restarted, it was also not registered again. I guess the dying state was introduced to prevent a container from being reregistered while it is being shut down. Because Sync() was changed to only consider containers in running state, this cannot happen anymore. The ttl-dying-cleanup config option is removed.
--- bridge/bridge.go | 41 ++++++++++------------------------------- bridge/types.go | 1 - registrator.go | 2 -- 3 files changed, 10 insertions(+), 34 deletions(-) diff --git a/bridge/bridge.go b/bridge/bridge.go index 6965400b..2c5e0c19 100644 --- a/bridge/bridge.go +++ b/bridge/bridge.go @@ -11,7 +11,6 @@ import ( "strconv" "strings" "sync" - "time" dockerapi "github.com/fsouza/go-dockerclient" ) @@ -20,12 +19,11 @@ var serviceIDPattern = regexp.MustCompile(`^(.+?):([a-zA-Z0-9][a-zA-Z0-9_.-]+):[ type Bridge struct { sync.Mutex - registry RegistryAdapter - docker *dockerapi.Client - services map[string][]*Service - deadContainers map[string]*DeadContainer - dyingContainers map[string]time.Time - config Config + registry RegistryAdapter + docker *dockerapi.Client + services map[string][]*Service + deadContainers map[string]*DeadContainer + config Config } func New(docker *dockerapi.Client, adapterUri string, config Config) (*Bridge, error) { @@ -40,12 +38,11 @@ func New(docker *dockerapi.Client, adapterUri string, config Config) (*Bridge, e log.Println("Using", uri.Scheme, "adapter:", uri) return &Bridge{ - docker: docker, - config: config, - registry: factory.New(uri), - services: make(map[string][]*Service), - deadContainers: make(map[string]*DeadContainer), - dyingContainers: make(map[string]time.Time), + docker: docker, + config: config, + registry: factory.New(uri), + services: make(map[string][]*Service), + deadContainers: make(map[string]*DeadContainer), }, nil } @@ -179,11 +176,6 @@ func (b *Bridge) Sync(quiet bool) { } func (b *Bridge) add(containerId string, quiet bool) { - if _, ok := b.dyingContainers[containerId]; ok { - log.Println("container, ", containerId[:12], ", is dying, ignoring") - return - } - if d := b.deadContainers[containerId]; d != nil { b.services[containerId] = d.Services delete(b.deadContainers, containerId) @@ -384,7 +376,6 @@ func (b *Bridge) remove(containerId string, deregister bool) { b.deadContainers[containerId] = 
&DeadContainer{b.config.RefreshTtl, b.services[containerId]} } delete(b.services, containerId) - b.markContainerAsDying(containerId) } // bit set on ExitCode if it represents an exit via a signal @@ -418,18 +409,6 @@ func (b *Bridge) shouldRemove(containerId string) bool { return false } -func (b *Bridge) markContainerAsDying(containerId string) { - // cleanup after CleanupDyingTtl - for containerId, t := range b.dyingContainers { - if time.Since(t) >= time.Millisecond*time.Duration(b.config.CleanupDyingTtl) { - delete(b.dyingContainers, containerId) - } - } - - // mark container as "dying" - b.dyingContainers[containerId] = time.Now() -} - var Hostname string func init() { diff --git a/bridge/types.go b/bridge/types.go index 3c95d148..e643ed3f 100644 --- a/bridge/types.go +++ b/bridge/types.go @@ -29,7 +29,6 @@ type Config struct { RefreshInterval int DeregisterCheck string Cleanup bool - CleanupDyingTtl int } type Service struct { diff --git a/registrator.go b/registrator.go index c49a50a0..b547ca57 100644 --- a/registrator.go +++ b/registrator.go @@ -34,7 +34,6 @@ var ( retryAttempts = flag.Int("retry-attempts", 0, "Max retry attempts to establish a connection with the backend. Use -1 for infinite retries") retryInterval = flag.Int("retry-interval", 2000, "Interval (in millisecond) between retry-attempts.") cleanup = flag.Bool("cleanup", false, "Remove dangling services") - cleanupDyingTtl = flag.Int("ttl-dying-cleanup", 60000, "TTL (in millisecond) for cleaning dying containers cache") ) func assert(err error) { @@ -110,7 +109,6 @@ func main() { RefreshInterval: *refreshInterval, DeregisterCheck: *deregister, Cleanup: *cleanup, - CleanupDyingTtl: *cleanupDyingTtl, }) assert(err)