docker-container-watchdog/container_watchdog.py at master · hkotka/docker-container-watchdog · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# pylint: disable = broad-except
import sys
import logging
import time
import json
import os
import re
import smtplib
from email.message import EmailMessage
import requests
import docker

# Logging: timestamped, level-prefixed messages at INFO level and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

# Polling cadence in seconds, overridable through environment variables.
polling_interval_after_restart: int = int(os.environ.get('POLLING_INTERVAL_AFTER_RESTART', '600'))
polling_interval: int = int(os.environ.get('POLLING_INTERVAL', '20'))

# Host label that is interpolated into notification texts.
docker_host: str = os.environ.get('DOCKER_HOSTMACHINE', 'UNKNOWN')

# Notification targets; an empty string leaves the corresponding channel disabled.
slack_webhook_url: str = os.environ.get('SLACK_WEBHOOK_URL', '')
email_sender: str = os.environ.get('EMAIL_SENDER', '')
email_receiver: str = os.environ.get('EMAIL_RECEIVER', '')
smtp_server: str = os.environ.get('SMTP_SERVER', '')

# Mutable module state shared by the functions below:
#   restarted_containers - short ids of containers the watchdog has restarted
#   notification_content - payload whose 'text' entry holds the latest message
restarted_containers: list = list()
notification_content: dict = dict()

# Test and establish connection to the Docker socket. Every later step goes
# through CLIENT, so a failure here is fatal and terminates the process.
try:
    CLIENT = docker.from_env()
    # Issue a request right away so an unusable connection is detected at
    # start-up instead of inside the polling loop.
    CLIENT.version()
    logging.info("Connection to Docker socket OK")
except Exception as err:
    logging.fatal("%s", err)
    # Exit with a non-zero status: a bare sys.exit() reports success (0), which
    # hides the failure from supervisors and on-failure restart policies.
    sys.exit(1)


def send_slack_message(content) -> None:
    """Post *content* as JSON to the configured Slack webhook.

    Does nothing when ``slack_webhook_url`` is empty. Delivery is best-effort:
    transport errors, timeouts and HTTP error responses are logged and are not
    propagated to the caller.

    Args:
        content: Mapping serialised with ``json.dumps``; its ``'text'`` entry
            is echoed to the log after a successful post.
    """
    if not slack_webhook_url:
        return
    try:
        # Without an explicit timeout the request could block forever and
        # stall the whole polling loop.
        response = requests.post(slack_webhook_url, data=json.dumps(content),
                                 headers={'Content-Type': 'application/json'},
                                 timeout=30)
        # Treat 4xx/5xx answers as failures instead of logging a false success.
        response.raise_for_status()
    except (requests.exceptions.RequestException, ConnectionError) as err:
        # RequestException is the base class of the errors requests raises
        # (connection errors, timeouts, HTTP errors); the builtin
        # ConnectionError is kept because the previous handler named it.
        logging.error("%s", err)
        return
    logging.info("Message sent to Slack webhook: %s", content['text'])


def send_smtp_message(content) -> None:
    """Send *content* as a plain-text alert e-mail through ``smtp_server``.

    Does nothing unless both ``email_receiver`` and ``smtp_server`` are set.
    Delivery is best-effort: connection and send failures are logged and are
    not propagated to the caller.

    Args:
        content: Notification text. Every character other than spaces, colons
            and ASCII letters/digits is removed before sending (this drops the
            ``*``, ``_`` and bracket markup used in the notification texts).
    """
    if not email_receiver or not smtp_server:
        return

    email_content: str = re.sub('[^ :A-Za-z0-9]+', '', content)
    email_message = EmailMessage()
    email_message.set_content(email_content)
    email_message['Subject'] = 'Container Watchdog Alert notification'
    email_message['From'] = email_sender
    email_message['To'] = email_receiver

    try:
        # Connecting happens in the constructor, so it must be inside the try:
        # an unreachable mail server must not take the watchdog down. The
        # context manager closes the session on both success and failure.
        with smtplib.SMTP(smtp_server, 25, timeout=40) as mail:
            mail.send_message(email_message)
            logging.info("Email sent to %s with content: %s", email_receiver, email_content)
    except Exception as err:
        logging.error("%s", err)


def get_container_health_status(container_object) -> str:
    """Return the value stored at ``attrs['State']['Health']['Status']``.

    When any of those keys is missing, the placeholder string ``'nokey'`` is
    returned instead.
    """
    try:
        # Walk down the nested attribute mapping one key at a time.
        node = container_object.attrs
        for key in ('State', 'Health', 'Status'):
            node = node[key]
    except KeyError:
        return 'nokey'
    return node


def restart_container(container_object) -> None:
    """Restart *container_object* and prepare the matching notification text.

    On success the restart is logged, ``notification_content['text']`` is set
    to a "restarted" message and the container's ``short_id`` is recorded in
    ``restarted_containers`` (once). On failure the error is logged,
    ``notification_content['text']`` is set to a failure message containing
    the error, and the exception is not propagated.

    Args:
        container_object: Object exposing ``restart()``, ``name``,
            ``short_id``, ``status`` and ``attrs``.
    """
    # Take health and state from the container that was passed in, captured
    # before the restart call, rather than from variables of the polling loop;
    # this keeps the function usable from any call site.
    health_before: str = get_container_health_status(container_object)
    status_before = container_object.status
    try:
        container_object.restart()
        logging.info("Restarted container: %s", container_object.name)
        notification_content['text'] = ("[Container watchdog]: has *_restarted_* container: [ *_{0}_* ] which had healthstatus: [ _{1}_ ] and state:"
                                        " [ _{2}_ ] on hostmachine [ _{3}_ ]".format(container_object.name, health_before, status_before, docker_host))
        if container_object.short_id not in restarted_containers:
            restarted_containers.append(container_object.short_id)
    except Exception as err:
        logging.fatal("%s", err)
        notification_content['text'] = ("[Container watchdog]: Docker daemon failed to restart container *{0}* on hostmachine *{1}*"
                                        " with error message: _{2}_".format(container_object.name, docker_host, err))


def container_recovered(container_object) -> None:
    """Record that a previously restarted container is healthy again.

    Logs the recovery, sets ``notification_content['text']`` to a "recovered"
    message and removes the container's ``short_id`` from
    ``restarted_containers``.

    Args:
        container_object: Object exposing ``name``, ``short_id``, ``status``
            and ``attrs``.
    """
    # Take health and state from the container that was passed in rather than
    # from variables of the polling loop, so the function works from any call
    # site.
    health_now: str = get_container_health_status(container_object)
    status_now = container_object.status
    logging.info("Container %s has recovered and is now healthy!", container_object.name)
    notification_content['text'] = ("[Container watchdog]: Container: [ *_{0}_* ] has *recovered* with healthstatus: [ _{1}_ ] and state: [ _{2}_ ]"
                                    " on hostmachine [ _{3}_ ]".format(container_object.name, health_now, status_now, docker_host))
    # list.remove raises ValueError for a missing item; tolerate a container
    # that is not (or no longer) tracked.
    if container_object.short_id in restarted_containers:
        restarted_containers.remove(container_object.short_id)


# Run the loop indefinitely. Poll every `polling_interval` seconds (default 20)
# normally, or wait `polling_interval_after_restart` seconds (default 600)
# after the watchdog has restarted a container.
while True:
    restart_status: bool = False
    try:
        container_list: list = CLIENT.containers.list()
    except Exception as err:
        # A failed listing must not terminate the watchdog; log it and try
        # again after the normal polling interval.
        logging.error("Failed to list containers: %s", err)
        time.sleep(polling_interval)
        continue
    for container in container_list:
        # NOTE: these are module-level names; keep them stable, since other
        # functions in this file may read them.
        container_status = container.status
        container_health_status: str = get_container_health_status(container)
        # Check if the container was restarted previously and is now healthy.
        # Send Slack/email notification. Remove from the list of restarted containers.
        if container.short_id in restarted_containers and container_health_status == 'healthy':
            container_recovered(container)
            send_slack_message(notification_content)
            send_smtp_message(notification_content['text'])
        # If the container reports health status 'unhealthy', restart it and
        # send Slack/email notification. Only the health status is checked here.
        elif container_health_status == 'unhealthy':
            logging.error("Found container in unhealthy state! Container: %s has health status: %s and container status: %s",
                          container.name, container_health_status, container_status)
            restart_container(container)
            send_slack_message(notification_content)
            send_smtp_message(notification_content['text'])
            restart_status = True
        logging.debug('%s - %s - %s', container.name, container_health_status, container_status)

    # Wait to poll again, longer if restarts were done in this pass.
    if restart_status:
        logging.info("Waiting %s seconds until next polling, because container was restarted", polling_interval_after_restart)
        time.sleep(polling_interval_after_restart)
    else:
        logging.info("All containers are in healthy state!")
        time.sleep(polling_interval)